| 280 282 282 282 282 282 253 51 50 251 251 249 281 281 175 1 280 253 253 216 215 253 252 282 260 260 202 200 202 169 171 40 25 25 171 281 281 281 280 280 76 26 26 26 26 26 281 281 26 281 80 121 77 78 281 281 281 282 281 81 281 280 281 281 281 8 8 8 8 8 8 251 8 251 8 8 8 8 8 8 8 8 55 252 79 79 79 252 37 251 251 251 251 251 252 251 252 8 252 8 252 251 230 43 226 226 280 281 281 36 8 8 281 277 281 50 252 251 251 252 75 252 252 216 252 52 252 52 281 51 50 280 281 81 80 30 80 80 79 45 44 43 38 80 78 79 280 261 281 280 1 87 57 33 57 57 57 26 8 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = packet->overhead; if (!asoc) return; /* update dst or transport pathmtu if in need */ if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pl_enabled(tp) && asoc->param_flags & SPP_PMTUD_ENABLE) { if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ if (ecn_capable) { struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } if (!tp->dst) return; /* set packet max_size with gso_max_size if gso is enabled*/ rcu_read_lock(); if (__sk_dst_get(sk) != tp->dst) { dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } /* Initialize the packet structure. */ void sctp_packet_init(struct sctp_packet *packet, struct sctp_transport *transport, __u16 sport, __u16 dport) { pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport); packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; INIT_LIST_HEAD(&packet->chunk_list); /* The overhead will be calculated by sctp_packet_config() */ packet->overhead = 0; sctp_packet_reset(packet); packet->vtag = 0; } /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { struct sctp_chunk *chunk, *tmp; pr_debug("%s: packet:%p\n", __func__, packet); list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } /* This routine tries to append the chunk to the offered packet. If adding * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk * is not present in the packet, it transmits the input packet. * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp) { enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { int error = 0; error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; /* If we have an empty packet, then we can NOT ever * return PMTU_FULL. */ if (!one_packet) retval = sctp_packet_append_chunk(packet, chunk); } break; case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: case SCTP_XMIT_DELAY: break; } return retval; } /* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_transport *t = pkt->transport; struct sctp_chunk *pad; int overhead = 0; if (!chunk->pmtu_probe) return SCTP_XMIT_OK; /* calculate the Padding Data size for the pad chunk */ overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); if (!pad) return SCTP_XMIT_DELAY; list_add_tail(&pad->list, &pkt->chunk_list); pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); chunk->transport = t; return SCTP_XMIT_OK; } /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; /* if we don't have an association, we can't do authentication */ if (!asoc) return retval; /* See if this is an auth chunk we are bundling or if * auth is already bundled. */ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) return retval; /* if the peer did not request this chunk to be authenticated, * don't do it */ if (!chunk->auth) return retval; auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; auth->shkey = chunk->shkey; sctp_auth_shkey_hold(auth->shkey); retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) sctp_chunk_free(auth); return retval; } /* Try to bundle a SACK with the packet. */ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. */ if (sctp_chunk_is_data(chunk) && !pkt->has_sack && !pkt->has_cookie_echo) { struct sctp_association *asoc; struct timer_list *timer; asoc = pkt->transport->asoc; timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; if (pkt->transport->sack_generation != pkt->transport->asoc->peer.sack_generation) return retval; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { retval = __sctp_packet_append_chunk(pkt, sack); if (retval != SCTP_XMIT_OK) { sctp_chunk_free(sack); goto out; } SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (timer_delete(timer)) sctp_association_put(asoc); } } } out: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); if (retval != SCTP_XMIT_OK) goto finish; /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ packet->has_sack = 1; /* Disallow AUTH bundling after DATA */ packet->has_auth = 1; /* Let it be knows that packet has DATA in it */ packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; /* Mainly used for prsctp RTX policy */ chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; } /* It is OK to send this chunk. */ list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); /* Data chunks are special. Before seeing what else we can * bundle into this packet, check to see if we are allowed to * send this DATA. */ if (sctp_chunk_is_data(chunk)) { retval = sctp_packet_can_append_data(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; } /* Try to bundle AUTH chunk */ retval = sctp_packet_bundle_auth(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; /* Try to bundle SACK chunk */ retval = sctp_packet_bundle_sack(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = __sctp_packet_append_chunk(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; } static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) skb_shinfo(head)->frag_list = skb; else SCTP_OUTPUT_CB(head)->last->next = skb; SCTP_OUTPUT_CB(head)->last = skb; head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_auth_chunk *auth = NULL; struct sctp_chunk *chunk, *tmp; int pkt_count = 0, pkt_size; struct sock *sk = head->sk; struct sk_buff *nskb; int auth_len = 0; if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; goto merge; } do { /* calculate the pkt_size and alloc nskb */ pkt_size = packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padded = SCTP_PAD4(chunk->skb->len); if (chunk == packet->auth) auth_len = padded; else if (auth_len + padded + packet->overhead > tp->pathmtu) return 0; else if (pkt_size + padded > tp->pathmtu) break; pkt_size += padded; } nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!nskb) return 0; skb_reserve(nskb, packet->overhead + MAX_HEADER); merge: /* merge chunks into nskb and append nskb into head list */ pkt_size -= packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padding; list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!sctp_chunk_retransmitted(chunk) && !tp->rto_pending) { chunk->rtt_in_progress = 1; tp->rto_pending = 1; } } padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) skb_put_zero(chunk->skb, padding); if (chunk == packet->auth) auth = (struct sctp_auth_chunk *) skb_tail_pointer(nskb); skb_put_data(nskb, chunk->skb->data, chunk->skb->len); pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), chunk->has_tsn ? "TSN" : "No TSN", chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, ntohs(chunk->chunk_hdr->length), chunk->skb->len, chunk->rtt_in_progress); pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) break; } if (auth) { sctp_auth_calculate_hmac(tp->asoc, nskb, auth, packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); else list_add(&packet->auth->list, &packet->chunk_list); } if (gso) sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); if (gso) { memset(head->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; goto chksum; } if (sctp_checksum_disable) return 1; if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); sh->checksum = sctp_compute_cksum(head, 0); } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } return pkt_count; } /* All packets are sent to the network through this function from * sctp_outq_tail(). * * The return value is always 0 for now. */ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; pr_debug("%s: packet:%p\n", __func__, packet); if (list_empty(&packet->chunk_list)) return 0; chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ packet->ipfragok = 1; } else { if (!sk_can_gso(sk)) { /* check gso */ pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; } gso = 1; } } /* alloc head skb */ head = alloc_skb((gso ? packet->overhead : packet->size) + MAX_HEADER, gfp); if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); sh->vtag = htonl(packet->vtag); sh->checksum = 0; /* drop packet if no dst */ if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { kfree_skb(head); goto out; } pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); /* start autoclose timer */ if (packet->has_data && sctp_state(asoc, ESTABLISHED) && asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { struct timer_list *timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; unsigned long timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); } /* sctp xmit */ tp->af_specific->ecn_capable(sk); if (asoc) { asoc->stats.opackets += pkt_count; if (asoc->peer.last_sent_to != tp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ if (tp->af_specific->sctp_xmit(head, tp) >= 0 && tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } sctp_packet_reset(packet); return 0; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private function check to see if a chunk can be added */ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; struct sctp_outq *q = &asoc->outqueue; /* RFC 2960 6.1 Transmission of DATA Chunks * * A) At any given time, the data sender MUST NOT transmit new data to * any destination transport address if its peer's rwnd indicates * that the peer has no buffer space (i.e. rwnd is 0, see Section * 6.2.1). However, regardless of the value of rwnd (including if it * is 0), the data sender can always have one DATA chunk in flight to * the receiver if allowed by cwnd (see rule B below). This rule * allows the sender to probe for a change in rwnd that the sender * missed due to the SACK having been lost in transit from the data * receiver to the data sender. */ rwnd = asoc->peer.rwnd; inflight = q->outstanding_bytes; flight_size = transport->flight_size; datasize = sctp_data_size(chunk); if (datasize > rwnd && inflight > 0) /* We have (at least) one data chunk in flight, * so we can't fall back to rule 6.1 B). */ return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * * B) At any given time, the sender MUST NOT transmit new data * to a given transport address if it has cwnd or more bytes * of data outstanding to that transport address. */ /* RFC 7.2.4 & the Implementers Guide 2.8. * * 3) ... * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. */ if (chunk->fast_retransmit != SCTP_NEED_FRTX && flight_size >= transport->cwnd) return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; /* Don't delay large message writes that may have been fragmented */ if (!chunk->msg->can_delay) return SCTP_XMIT_OK; /* Defer until all data acked or packet full */ return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { struct sctp_transport *transport = packet->transport; size_t datasize = sctp_data_size(chunk); struct sctp_association *asoc = transport->asoc; u32 rwnd = asoc->peer.rwnd; /* Keep track of how many bytes are in flight over this transport. */ transport->flight_size += datasize; /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else rwnd = 0; asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; /* Don't bundle in this packet if this chunk's auth key doesn't * match other chunks already enqueued on this packet. Also, * don't bundle the chunk with auth key if other chunks in this * packet don't have auth key. */ if ((packet->auth && chunk->shkey != packet->auth->shkey) || (!packet->auth && chunk->shkey && chunk->chunk_hdr->type != SCTP_CID_AUTH)) return SCTP_XMIT_PMTU_FULL; psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; else pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ if (psize + chunk_len > pmtu) { /* It's OK to fragment at IP level if any one of the following * is true: * 1. The packet is empty (meaning this chunk is greater * the MTU) * 2. The packet doesn't have any data in it yet and data * requires authentication. */ if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; goto out; } /* Similarly, if this chunk was built before a PMTU * reduction, we have to fragment it at IP level now. So * if the packet already contains something, we need to * flush. */ maxsize = pmtu - packet->overhead; if (packet->auth) maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of * a large frame. So in this case we allow the * fragmentation by forcing it to be in a new packet. */ if (!sctp_chunk_is_data(chunk) && packet->has_data) retval = SCTP_XMIT_PMTU_FULL; if (psize + chunk_len > packet->max_size) /* Hit GSO/PMTU limit, gotta flush */ retval = SCTP_XMIT_PMTU_FULL; if (!packet->transport->burst_limited && psize + chunk_len > (packet->transport->cwnd >> 1)) /* Do not allow a single GSO packet to use more * than half of cwnd. */ retval = SCTP_XMIT_PMTU_FULL; if (packet->transport->burst_limited && psize + chunk_len > (packet->transport->burst_limited >> 1)) /* Do not allow a single GSO packet to use more * than half of original cwnd. */ retval = SCTP_XMIT_PMTU_FULL; /* Otherwise it will fit in the GSO packet */ } out: return retval; } |
| 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for SMC Connections, Link Groups and Links * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef _SMC_CORE_H #define _SMC_CORE_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> #include <net/smc.h> #include "smc.h" #include "smc_ib.h" #include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ #define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, * also is the default value for SMC-R v1 and v2.0 */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 16-255 as needed. */ #define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */ #define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ u32 num; /* unique link group number */ }; enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ }; enum smc_link_state { /* possible states of a link */ SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; struct smc_wr_v2_buf { u8 raw[SMC_WR_BUF_V2_SIZE]; }; #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { POSTED, /* ib_wr_reg_mr request posted */ CONFIRMED, /* ib_wr_reg_mr response: successful */ FAILED /* ib_wr_reg_mr response: failure */ }; struct smc_rdma_sge { /* sges for RDMA writes */ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; }; #define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per * message send */ struct smc_rdma_sges { /* sges per message send */ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; }; struct smc_rdma_wr { /* work requests per message * send */ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; #define SMC_LGR_ID_SIZE 4 struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ struct ib_sge *wr_tx_sges; /* WR send gather meta data */ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ struct { struct percpu_ref wr_tx_refs; } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ int wr_rx_buflen; /* buffer len for the first sge, len for the * second sge is lgr shared if rx sge is 2. */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ struct { struct percpu_ref wr_reg_refs; } ____cacheline_aligned_in_smp; struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ u8 clearing : 1; /* link is being cleared */ refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ u16 max_send_wr; u16 max_recv_wr; }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ #define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the * default value for smc-r v1.0 and v2.0 */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 1-2 as needed. */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ union { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; /* memory region: for rmb and * vzalloced sndbuf * incl. rkey provided to peer * and lkey provided to local */ u32 order; /* allocation order */ u8 is_conf_rkey; /* confirm_rkey done */ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ u8 is_vm; /* virtually contiguous */ }; struct { /* SMC-D */ /* SMC-D tx buffer */ bool is_attached; /* no need for explicit writes */ /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; /* DMB token number */ dma_addr_t dma_addr; /* DMA address */ }; }; }; struct smc_rtoken { /* address/key of remote RMB */ u64 dma_addr; u32 rkey; }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ struct smcd_dev; enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_NONE, /* no active links, lgr to be deleted */ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ SMCR_PHYS_CONT_BUFS = 0, SMCR_VIRT_CONT_BUFS = 1, SMCR_MIXED_BUFS = 2, }; enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; struct smc_llc_qentry; struct smc_llc_flow { enum smc_llc_flowtype type; struct smc_llc_qentry *qentry; }; struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct rw_semaphore rmbs_lock; /* protects rx buffers */ u64 alloc_sndbufs; /* stats of tx buffers */ u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; u8 peer_os; /* peer operating system */ u8 peer_smc_release; u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; union { struct { /* SMC-R */ enum smc_lgr_role role; /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ struct smc_wr_v2_buf *wr_rx_buf_v2; /* WR v2 recv payload buffer */ struct smc_wr_v2_buf *wr_tx_buf_v2; /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] [SMC_LINKS_PER_LGR_MAX]; /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_flow_waiter; /* w4 next llc event */ wait_queue_head_t llc_msg_waiter; /* w4 next llc msg */ struct smc_llc_flow llc_flow_lcl; /* llc local control field */ struct smc_llc_flow llc_flow_rmt; /* llc remote control field */ struct smc_llc_qentry *delayed_event; /* arrived when flow active */ spinlock_t llc_flow_lock; /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; /* net namespace */ struct net *net; u8 max_conns; /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ u16 max_send_wr; /* number of WR buffers on send */ u16 max_recv_wr; /* number of WR buffers on recv */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ u8 peer_shutdown : 1; /* peer triggered shutdownn */ }; }; }; struct smc_clc_msg_local; #define GID_LIST_SIZE 2 struct smc_gidlist { u8 len; u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; }; struct smc_init_info_smcrv2 { /* Input fields */ __be32 saddr; struct sock *clc_sk; __be32 daddr; /* Output fields when saddr is set */ struct smc_ib_device *ib_dev_v2; u8 ib_port_v2; u8 ib_gid_v2[SMC_GID_SIZE]; /* Additional output fields when clc_sk and daddr is set as well */ u8 uses_gateway; u8 nexthop_mac[ETH_ALEN]; struct smc_gidlist gidlist; }; #define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES /* max # of proposed non-native ISM devices, * which can't exceed the max # of CHID-GID * entries in CLC proposal SMC-Dv2 extension. */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; u8 release_nr; u8 max_conns; u8 max_links; u8 first_contact_peer; u8 first_contact_local; u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ u8 smcr_version; u8 check_smcrv2; u8 peer_gid[SMC_GID_SIZE]; u8 peer_mac[ETH_ALEN]; u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; }; /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. * Requires @conns_lock * @token alert token to search for * @lgr link group to search in * Returns connection associated with token if found, NULL otherwise. */ static inline struct smc_connection *smc_lgr_find_conn( u32 token, struct smc_link_group *lgr) { struct smc_connection *res = NULL; struct rb_node *node; node = lgr->conns_all.rb_node; while (node) { struct smc_connection *cur = rb_entry(node, struct smc_connection, alert_node); if (cur->alert_token_local > token) { node = node->rb_left; } else { if (cur->alert_token_local < token) { node = node->rb_right; } else { res = cur; break; } } } return res; } static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { return conn->lgr && conn->alert_token_local; } /* * Returns true if the specified link is usable. * * usable means the link is ready to receive RDMA messages, map memory * on the link, etc. This doesn't ensure we are able to send RDMA messages * on this link, if sending RDMA messages is needed, use smc_link_sendable() */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) return false; return true; } /* * Returns true if the specified link is ready to receive AND send RDMA * messages. * * For the client side in first contact, the underlying QP may still in * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() * is not strong enough. For those places that need to send any CDC or LLC * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && lnk->qp_attr.cur_qp_state == IB_QPS_RTS; } static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; } static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) { return lnk->wr_rx_sge_cnt > 1; } static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", be16_to_cpu(((__be16 *)gid_raw)[0]), be16_to_cpu(((__be16 *)gid_raw)[1]), be16_to_cpu(((__be16 *)gid_raw)[2]), be16_to_cpu(((__be16 *)gid_raw)[3]), be16_to_cpu(((__be16 *)gid_raw)[4]), be16_to_cpu(((__be16 *)gid_raw)[5]), be16_to_cpu(((__be16 *)gid_raw)[6]), be16_to_cpu(((__be16 *)gid_raw)[7])); } struct smc_pci_dev { __u32 pci_fid; __u16 pci_pchid; __u16 pci_vendor; __u16 pci_device; __u8 pci_id[SMC_PCI_ID_STR_LEN]; }; static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_pci_dev *smc_dev) { smc_dev->pci_vendor = pci_dev->vendor; smc_dev->pci_device = pci_dev->device; snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", pci_name(pci_dev)); #if IS_ENABLED(CONFIG_S390) { /* Set s390 specific PCI information */ struct zpci_dev *zdev; zdev = to_zpci(pci_dev); smc_dev->pci_fid = zdev->fid; smc_dev->pci_pchid = zdev->pchid; } #endif } struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); int smc_core_init(void); void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); void smcr_link_hold(struct smc_link *lnk); void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; } #endif |
| 754 55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* * Cpus stopping functions in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask(): Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } /* * Architecture specific boot CPU setup. Defined as empty weak function in * init/main.c. Architectures can override it. */ void __init smp_prepare_boot_cpu(void); #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. */ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); bool cpus_peek_for_pending_ipi(const struct cpumask *mask); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } static inline bool cpus_peek_for_pending_ipi(const struct cpumask *mask) { return false; } #define setup_max_cpus 0 #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static __always_inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /** * raw_smp_processor_id() - get the current (unstable) CPU id * * For then you know what you are doing and need an unstable * CPU id. */ /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN * when smp_processor_id() is used when the CPU id is not stable. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a * regular asm read for the stable. */ #ifndef __smp_processor_id #define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG bool csd_lock_is_stuck(void); #else static inline bool csd_lock_is_stuck(void) { return false; } #endif #endif /* __LINUX_SMP_H */ |
| 1 1002 1003 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | /* Copyright 2011, Siemens AG * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ /* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* Jon's code is based on 6lowpan implementation for Contiki which is: * Copyright (c) 2008, Swedish Institute of Computer Science. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <linux/if_arp.h> #include <net/ipv6.h> #include <net/netdev_lock.h> #include "6lowpan_i.h" static int open_count; static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; static int lowpan_dev_init(struct net_device *ldev) { netdev_lockdep_set_classes(ldev); return 0; } static int lowpan_open(struct net_device *dev) { if (!open_count) lowpan_rx_init(); open_count++; return 0; } static int lowpan_stop(struct net_device *dev) { open_count--; if (!open_count) lowpan_rx_exit(); return 0; } static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); /* default no short_addr is available for a neighbour */ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); return 0; } static int lowpan_get_iflink(const struct net_device *dev) { return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex); } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) { memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; ldev->netns_immutable = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) return -EINVAL; } return 0; } static int lowpan_newlink(struct net_device *ldev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **tb = params->tb; struct net_device *wdev; int ret; ASSERT_RTNL(); pr_debug("adding new link\n"); if (!tb[IFLA_LINK]) return -EINVAL; if (params->link_net && !net_eq(params->link_net, dev_net(ldev))) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) return -ENODEV; if (wdev->type != ARPHRD_IEEE802154) { dev_put(wdev); return -EINVAL; } if (wdev->ieee802154_ptr->lowpan_dev) { dev_put(wdev); return -EBUSY; } lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 * header. */ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; } wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; lowpan_unregister_netdevice(ldev); dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, .validate = lowpan_validate, }; static inline int __init lowpan_netlink_init(void) { return rtnl_link_register(&lowpan_link_ops); } static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev; if (ndev->type != ARPHRD_IEEE802154) return NOTIFY_DONE; wpan_dev = ndev->ieee802154_ptr; if (!wpan_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: /* Check if wpan interface is unregistered that we * also delete possible lowpan interfaces which belongs * to the wpan interface. */ if (wpan_dev->lowpan_dev) lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { .notifier_call = lowpan_device_event, }; static int __init lowpan_init_module(void) { int err = 0; err = lowpan_net_frag_init(); if (err < 0) goto out; err = lowpan_netlink_init(); if (err < 0) goto out_frag; err = register_netdevice_notifier(&lowpan_dev_notifier); if (err < 0) goto out_pack; return 0; out_pack: lowpan_netlink_fini(); out_frag: lowpan_net_frag_exit(); out: return err; } static void __exit lowpan_cleanup_module(void) { lowpan_netlink_fini(); lowpan_net_frag_exit(); unregister_netdevice_notifier(&lowpan_dev_notifier); } module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan"); |
| 137 137 137 137 136 137 136 137 137 137 137 137 37 38 37 38 38 38 38 38 38 4 4 4 8 8 8 7 6 2 2 2 2 8 8 40 40 40 40 40 40 21858 21853 445 445 445 21860 21690 21855 38 37 38 38 37 38 38 38 38 38 38 38 38 38 38 38 38 7 7 38 38 38 38 38 38 38 38 38 22 22 39 2 2 2 39 39 39 102 101 4 4 3 3 3 5 5 38 75 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <uapi/linux/btf.h> #include <crypto/sha1.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> #include <linux/overflow.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> #include <linux/perf_event.h> #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <linux/execmem.h> #include <crypto/sha2.h> #include <asm/barrier.h> #include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] #define BPF_R2 regs[BPF_REG_2] #define BPF_R3 regs[BPF_REG_3] #define BPF_R4 regs[BPF_REG_4] #define BPF_R5 regs[BPF_REG_5] #define BPF_R6 regs[BPF_REG_6] #define BPF_R7 regs[BPF_REG_7] #define BPF_R8 regs[BPF_REG_8] #define BPF_R9 regs[BPF_REG_9] #define BPF_R10 regs[BPF_REG_10] /* Named registers */ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; bool bpf_global_ma_set; /* No hurry in this branch * * Exported for the bpf jit load helper. */ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; } else if (k >= SKF_LL_OFF) { if (unlikely(!skb_mac_header_was_set(skb))) return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } /* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); return NULL; } fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->main_prog_aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); #ifdef CONFIG_FINEIBT INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); #endif return fp; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) return NULL; prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) { if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { if (prog->aux->jited_linfo && (!prog->jited || !prog->aux->jited_linfo[0])) { kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } kfree(prog->aux->kfunc_tab); prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array * for insn_off to the jited_off mapping (insn_to_jit_off). * * The idx to this array is the insn_off. Hence, the insn_off * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: * The first bpf insn off of the prog. The insn off * here is relative to the main prog. * e.g. if prog is a subprog, insn_start > 0 * linfo_idx: * The prog's idx to prog->aux->linfo and jited_linfo * * jited_linfo[linfo_idx] = prog->bpf_func * * For i > linfo_idx, * * jited_linfo[i] = prog->bpf_func + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] */ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off) { u32 linfo_idx, insn_start, insn_end, nr_linfo, i; const struct bpf_line_info *linfo; void **jited_linfo; if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; linfo_idx = prog->aux->linfo_idx; linfo = &prog->aux->linfo[linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + prog->len; jited_linfo = &prog->aux->jited_linfo[linfo_idx]; jited_linfo[0] = prog->bpf_func; nr_linfo = prog->aux->nr_linfo - linfo_idx; for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) /* The verifier ensures that linfo[i].insn_off is * strictly increasing */ jited_linfo[i] = prog->bpf_func + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; fp = __vmalloc(size, gfp_flags); if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. */ fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; __bpf_prog_free(fp_old); } return fp; } void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); } int bpf_prog_calc_tag(struct bpf_prog *fp) { size_t size = bpf_prog_insn_size(fp); struct bpf_insn *dst; bool was_ld_map; u32 i; dst = vmalloc(size); if (!dst) return -ENOMEM; /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && (dst[i].src_reg == BPF_PSEUDO_MAP_FD || dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && dst[i].code == 0 && dst[i].dst_reg == 0 && dst[i].src_reg == 0 && dst[i].off == 0) { was_ld_map = false; dst[i].imm = 0; } else { was_ld_map = false; } } sha256((u8 *)dst, size, fp->digest); vfree(dst); return 0; } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; s64 imm = insn->imm; if (curr < pos && curr + imm + 1 >= end_old) imm += delta; else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; if (!probe_pass) insn->imm = imm; return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { s64 off_min, off_max, off; s32 delta = end_new - end_old; if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; off_min = S32_MIN; off_max = S32_MAX; } else { off = insn->off; off_min = S16_MIN; off_max = S16_MAX; } if (curr < pos && curr + off + 1 >= end_old) off += delta; else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; if (!probe_pass) { if (insn->code == (BPF_JMP32 | BPF_JA)) insn->imm = off; else insn->off = off; } return 0; } static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, s32 end_new, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { u8 code; /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { i = end_new; insn = prog->insnsi + end_old; } if (bpf_pseudo_func(insn)) { ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); if (ret) return ret; continue; } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); } else { ret = bpf_adj_delta_to_off(insn, pos, end_old, end_new, i, probe_pass); } if (ret) break; } return ret; } static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) { struct bpf_line_info *linfo; u32 i, nr_linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || !delta) return; linfo = prog->aux->linfo; for (i = 0; i < nr_linfo; i++) if (off < linfo[i].insn_off) break; /* Push all off < linfo[i].insn_off by delta */ for (; i < nr_linfo; i++) linfo[i].insn_off += delta; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { memcpy(prog->insnsi + off, patch, sizeof(*patch)); return prog; } insn_adj_cnt = prog->len + insn_delta; /* Reject anything that would potentially let the insn->off * target overflow when we have excessive program expansions. * We need to probe here before we do any reallocation where * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. */ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; /* Patching happens in 3 steps: * * 1) Move over tail of insnsi from next instruction onwards, * so we can patch the single target insn with one or more * new ones (patching is always from 1 to n insns, n > 0). * 2) Inject new instructions at the target location. * 3) Adjust branch offsets if necessary. */ insn_rest = insn_adj_cnt - off - len; memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); /* We are guaranteed to not fail at this point, otherwise * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. */ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { int err; /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ memmove(prog->insnsi + off, prog->insnsi + off + cnt, sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; err = bpf_adj_branches(prog, off, off + cnt, off, false); WARN_ON_ONCE(err); return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) { bpf_prog_kallsyms_del_subprogs(fp); bpf_prog_kallsyms_del(fp); } #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void bpf_prog_ksym_set_name(struct bpf_prog *prog) { char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + /* name has been null terminated. * We should need +1 for the '_' preceding * the name. However, the null character * is double counted between the name and the * sizeof("bpf_prog_") above, so we omit * the +1 here. */ sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; } if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else *sym = 0; } static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; const struct bpf_ksym *ksym; ksym = container_of(n, struct bpf_ksym, tnode); if (val < ksym->start) return -1; /* Ensure that we detect return addresses as part of the program, when * the final instruction is a call for a program part of the stack * trace. Therefore, do val > ksym->end instead of val >= ksym->end. */ if (val > ksym->end) return 1; return 0; } static const struct latch_tree_ops bpf_tree_ops = { .less = bpf_tree_less, .comp = bpf_tree_comp, }; static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; void bpf_ksym_add(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); WARN_ON_ONCE(!list_empty(&ksym->lnode)); list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); spin_unlock_bh(&bpf_lock); } static void __bpf_ksym_del(struct bpf_ksym *ksym) { if (list_empty(&ksym->lnode)) return; latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&ksym->lnode); } void bpf_ksym_del(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); __bpf_ksym_del(ksym); spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); #ifdef CONFIG_FINEIBT /* * When FineIBT, code in the __cfi_foo() symbols can get executed * and hence unwinder needs help. */ if (cfi_mode != CFI_FINEIBT) return; snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, "__cfi_%s", fp->aux->ksym.name); fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; bpf_ksym_add(&fp->aux->ksym_prefix); #endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp)) return; bpf_ksym_del(&fp->aux->ksym); #ifdef CONFIG_FINEIBT if (cfi_mode != CFI_FINEIBT) return; bpf_ksym_del(&fp->aux->ksym_prefix); #endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); if (ksym) { unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); if (size) *size = symbol_end - symbol_start; if (off) *off = addr - symbol_start; } rcu_read_unlock(); return ret; } bool is_bpf_text_address(unsigned long addr) { bool ret; rcu_read_lock(); ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym; WARN_ON_ONCE(!rcu_read_lock_held()); ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } bool bpf_has_frame_pointer(unsigned long ip) { struct bpf_ksym *ksym; unsigned long offset; guard(rcu)(); ksym = bpf_ksym_find(ip); if (!ksym || !ksym->fp_start || !ksym->fp_end) return false; offset = ip - ksym->start; return offset >= ksym->fp_start && offset < ksym->fp_end; } const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) goto out; e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); out: rcu_read_unlock(); return e; } int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; if (!bpf_jit_kallsyms_enabled()) return ret; rcu_read_lock(); list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; break; } rcu_read_unlock(); return ret; } int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; static const u32 poke_tab_max = 1024; u32 slot = prog->aux->size_poke_tab; u32 size = slot + 1; if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { case BPF_POKE_REASON_TAIL_CALL: if (!poke->tail_call.map) return -EINVAL; break; default: return -EINVAL; } tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; memcpy(&tab[slot], poke, sizeof(*poke)); prog->aux->size_poke_tab = size; prog->aux->poke_tab = tab; return slot; } /* * BPF program pack allocator. * * Most BPF programs are pretty small. Allocating a hole page for each * program is sometime a waste. Many small bpf program also adds pressure * to instruction TLB. To solve this issue, we introduce a BPF program pack * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) struct bpf_prog_pack { struct list_head list; void *ptr; unsigned long bitmap[]; }; void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { memset(area, 0, size); } #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); /* PMD_SIZE is not available in some special config, e.g. ARCH=arm with * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE /* PMD_SIZE is really big for some archs. It doesn't make sense to * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be * greater than or equal to 2MB. */ #define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif #define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); set_vm_flush_reset_perms(pack->ptr); err = set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); if (err) goto out; list_add_tail(&pack->list, &pack_list); return pack; out: bpf_jit_free_exec(pack->ptr); kfree(pack); return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; unsigned long pos; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { int err; bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); err = set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); if (err) { bpf_jit_free_exec(ptr); ptr = NULL; } } goto out; } list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; pos = 0; found_free_area: bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); out: mutex_unlock(&pack_mutex); return ptr; } void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } } if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(size); pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); bpf_jit_free_exec(pack->ptr); kfree(pack); } out: mutex_unlock(&pack_mutex); } static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, * dedicated BPF backend memory area, or if neither of the two * below apply. */ u64 __weak bpf_jit_alloc_exec_limit(void) { #if defined(MODULES_VADDR) return MODULES_END - MODULES_VADDR; #else return VMALLOC_END - VMALLOC_START; #endif } static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } return 0; } void bpf_jit_uncharge_modmem(u32 size) { atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) { return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { execmem_free(addr); } struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; return hdr; } void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 size = hdr->size; bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(size); } /* Allocate jit binary from bpf_prog_pack allocator. * Since the allocated memory is RO+X, the JIT engine cannot write directly * to the memory. To solve this problem, a RW buffer is also allocated at * as the same time. The JIT engine should calculate offsets based on the * RO memory address, but write JITed program to the RW buffer. Once the * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies * the JITed program to the RO memory. */ struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *ro_header; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* add 16 bytes for a random section of illegal instructions */ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; } *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(*rw_header, size); (*rw_header)->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; return ro_header; } /* Copy JITed text from rw_header to its final location, the ro_header. */ int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); kvfree(rw_header); if (IS_ERR(ptr)) { bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; } /* bpf_jit_binary_pack_free is called in two different scenarios: * 1) when the program is freed after; * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). * For case 2), we need to free both the RO memory and the RW buffer. * * bpf_jit_binary_pack_free requires proper ro_header->size. However, * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size * must be set with either bpf_jit_binary_pack_finalize (normal path) or * bpf_arch_text_copy (when jit fails). */ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { u32 size = ro_header->size; bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & BPF_PROG_CHUNK_MASK; return (void *)addr; } static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & PAGE_MASK; return (void *)addr; } /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. */ void __weak bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); } int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed) { s16 off = insn->off; s32 imm = insn->imm; u8 *addr; int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { /* Place-holder address till the last pass has collected * all addresses for JITed subprograms in which case we * can pick them up from prog->aux. */ if (!extra_pass) addr = NULL; else if (prog->aux->func && off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && bpf_jit_supports_far_kfunc_call()) { err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); if (err) return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base * and the helper with imm relative to it are both in core * kernel. */ addr = (u8 *)__bpf_call_base + imm; } *func_addr = (unsigned long)addr; return 0; } const char *bpf_jit_get_prog_name(struct bpf_prog *prog) { if (prog->aux->ksym.prog) return prog->aux->ksym.name; return prog->aux->name; } static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * * AX register is inaccessible from user space. It is mapped in * all JITs, and used here for constant blinding rewrites. It is * typically "stateless" meaning its contents are only valid within * the executed instruction, but not across several instructions. * There are a few exceptions however which are further detailed * below. * * Constant blinding is only used by JITs, not in the interpreter. * The interpreter uses AX in some occasions as a local temporary * register e.g. in DIV or MOD instructions. * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! */ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) goto out; if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); goto out; } switch (from->code) { case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); break; case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); if (emit_zext) *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; case BPF_ST | BPF_MEM | BPF_DW: case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; } out: return to - to_buff; } static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. */ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); } return fp; } static void bpf_prog_clone_free(struct bpf_prog *fp) { /* aux was stolen by the other clone, so we cannot free * it from this path! It will be freed eventually by the * other program on release. * * At this point, we don't need a deferred release since * clone is guaranteed to not be locked. */ fp->aux = NULL; fp->stats = NULL; fp->active = NULL; __bpf_prog_free(fp); } void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { /* We have to repoint aux->prog to self, as we don't * know whether fp here is the clone or the original. */ fp->aux->prog = fp; bpf_prog_clone_free(fp_other); } static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct bpf_map *map; int i; if (len <= 1) return; for (i = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) bpf_insn_array_adjust(map, off, len); } #endif } struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; int insn_delta, insn_cnt; struct bpf_insn *insn; int i, rewritten; if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); insn_cnt = clone->len; insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { if (bpf_pseudo_func(insn)) { /* ld_imm64 with an address of bpf subprog is not * a user controlled constant. Don't randomize it, * since it will conflict with jit_subprogs() logic. */ insn++; i++; continue; } /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. */ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, clone->aux->verifier_zext); if (!rewritten) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); return tmp; } clone = tmp; insn_delta = rewritten - 1; /* Instructions arrays must be updated using absolute xlated offsets */ adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; i += insn_delta; } clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. This also needs * to go into kallsyms for correlation from e.g. bpftool, so naming * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } EXPORT_SYMBOL_GPL(__bpf_call_base); /* All UAPI available opcodes. */ #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU, ADD, X), \ INSN_3(ALU, SUB, X), \ INSN_3(ALU, AND, X), \ INSN_3(ALU, OR, X), \ INSN_3(ALU, LSH, X), \ INSN_3(ALU, RSH, X), \ INSN_3(ALU, XOR, X), \ INSN_3(ALU, MUL, X), \ INSN_3(ALU, MOV, X), \ INSN_3(ALU, ARSH, X), \ INSN_3(ALU, DIV, X), \ INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU, ADD, K), \ INSN_3(ALU, SUB, K), \ INSN_3(ALU, AND, K), \ INSN_3(ALU, OR, K), \ INSN_3(ALU, LSH, K), \ INSN_3(ALU, RSH, K), \ INSN_3(ALU, XOR, K), \ INSN_3(ALU, MUL, K), \ INSN_3(ALU, MOV, K), \ INSN_3(ALU, ARSH, K), \ INSN_3(ALU, DIV, K), \ INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ INSN_3(ALU64, SUB, X), \ INSN_3(ALU64, AND, X), \ INSN_3(ALU64, OR, X), \ INSN_3(ALU64, LSH, X), \ INSN_3(ALU64, RSH, X), \ INSN_3(ALU64, XOR, X), \ INSN_3(ALU64, MUL, X), \ INSN_3(ALU64, MOV, X), \ INSN_3(ALU64, ARSH, X), \ INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ INSN_3(ALU64, AND, K), \ INSN_3(ALU64, OR, K), \ INSN_3(ALU64, LSH, K), \ INSN_3(ALU64, RSH, K), \ INSN_3(ALU64, XOR, K), \ INSN_3(ALU64, MUL, K), \ INSN_3(ALU64, MOV, K), \ INSN_3(ALU64, ARSH, K), \ INSN_3(ALU64, DIV, K), \ INSN_3(ALU64, MOD, K), \ /* Call instruction. */ \ INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ /* 32-bit Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP32, JEQ, X), \ INSN_3(JMP32, JNE, X), \ INSN_3(JMP32, JGT, X), \ INSN_3(JMP32, JLT, X), \ INSN_3(JMP32, JGE, X), \ INSN_3(JMP32, JLE, X), \ INSN_3(JMP32, JSGT, X), \ INSN_3(JMP32, JSLT, X), \ INSN_3(JMP32, JSGE, X), \ INSN_3(JMP32, JSLE, X), \ INSN_3(JMP32, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP32, JEQ, K), \ INSN_3(JMP32, JNE, K), \ INSN_3(JMP32, JGT, K), \ INSN_3(JMP32, JLT, K), \ INSN_3(JMP32, JGE, K), \ INSN_3(JMP32, JLE, K), \ INSN_3(JMP32, JSGT, K), \ INSN_3(JMP32, JSLT, K), \ INSN_3(JMP32, JSGE, K), \ INSN_3(JMP32, JSLE, K), \ INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ INSN_3(JMP, JNE, X), \ INSN_3(JMP, JGT, X), \ INSN_3(JMP, JLT, X), \ INSN_3(JMP, JGE, X), \ INSN_3(JMP, JLE, X), \ INSN_3(JMP, JSGT, X), \ INSN_3(JMP, JSLT, X), \ INSN_3(JMP, JSGE, X), \ INSN_3(JMP, JSLE, X), \ INSN_3(JMP, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP, JEQ, K), \ INSN_3(JMP, JNE, K), \ INSN_3(JMP, JGT, K), \ INSN_3(JMP, JLT, K), \ INSN_3(JMP, JGE, K), \ INSN_3(JMP, JLE, K), \ INSN_3(JMP, JSGT, K), \ INSN_3(JMP, JSLT, K), \ INSN_3(JMP, JSGE, K), \ INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ /* Atomic operations. */ \ INSN_3(STX, ATOMIC, B), \ INSN_3(STX, ATOMIC, H), \ INSN_3(STX, ATOMIC, W), \ INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ INSN_3(ST, MEM, W), \ INSN_3(ST, MEM, DW), \ /* Load instructions. */ \ /* Register based. */ \ INSN_3(LDX, MEM, B), \ INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ INSN_3(LDX, MEMSX, B), \ INSN_3(LDX, MEMSX, H), \ INSN_3(LDX, MEMSX, W), \ /* Immediate based. */ \ INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true static const bool public_insntable[256] = { [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ [BPF_LD | BPF_ABS | BPF_B] = true, [BPF_LD | BPF_ABS | BPF_H] = true, [BPF_LD | BPF_ABS | BPF_W] = true, [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL return public_insntable[code]; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. * * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) select_insn: goto *jumptable[insn->code]; /* Explicitly mask the register-based shift amounts with 63 or 31 * to avoid undefined behavior. Normally this won't affect the * generated code, for example, in case of native 64 bit archs such * as x86-64 or arm64, the compiler is optimizing the AND away for * the interpreter. In case of JITs, each of the JIT backends compiles * the BPF shift operations to machine instructions which produce * implementation-defined results in such a case; the resulting * contents of the register may be arbitrary, but program behaviour * as a whole remains defined. In other words, in case of JIT backends, * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. */ /* ALU (shifts) */ #define SHT(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP (SRC & 63); \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP ((u32) SRC & 31); \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; /* ALU (rest) */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) ALU(XOR, ^) ALU(MUL, *) SHT(LSH, <<) SHT(RSH, >>) #undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; CONT; ALU64_NEG: DST = -DST; CONT; ALU_MOV_X: switch (OFF) { case 0: DST = (u32) SRC; break; case 8: DST = (u32)(s8) SRC; break; case 16: DST = (u32)(s16) SRC; break; } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: switch (OFF) { case 0: DST = SRC; break; case 8: DST = (s8) SRC; break; case 16: DST = (s16) SRC; break; case 32: DST = (s32) SRC; break; } CONT; ALU64_MOV_K: DST = IMM; CONT; LD_IMM_DW: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; ALU_ARSH_X: DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: switch (OFF) { case 0: div64_u64_rem(DST, SRC, &AX); DST = AX; break; case 1: AX = div64_s64(DST, SRC); DST = DST - AX * SRC; break; } CONT; ALU_MOD_X: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) SRC); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_MOD_K: switch (OFF) { case 0: div64_u64_rem(DST, IMM, &AX); DST = AX; break; case 1: AX = div64_s64(DST, IMM); DST = DST - AX * IMM; break; } CONT; ALU_MOD_K: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) IMM); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_DIV_X: switch (OFF) { case 0: DST = div64_u64(DST, SRC); break; case 1: DST = div64_s64(DST, SRC); break; } CONT; ALU_DIV_X: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) SRC); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU64_DIV_K: switch (OFF) { case 0: DST = div64_u64(DST, IMM); break; case 1: DST = div64_s64(DST, IMM); break; } CONT; ALU_DIV_K: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) IMM); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU_END_TO_BE: switch (IMM) { case 16: DST = (__force u16) cpu_to_be16(DST); break; case 32: DST = (__force u32) cpu_to_be32(DST); break; case 64: DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) cpu_to_le16(DST); break; case 32: DST = (__force u32) cpu_to_le32(DST); break; case 64: DST = (__force u64) cpu_to_le64(DST); break; } CONT; ALU64_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) __swab16(DST); break; case 32: DST = (__force u32) __swab32(DST); break; case 64: DST = (__force u64) __swab64(DST); break; } CONT; /* CALL */ JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and stores return value * into BPF_R0. */ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5); CONT; JMP_CALL_ARGS: BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5, insn + insn->off + 1); CONT; JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; goto select_insn; out: CONT; } JMP_JA: insn += insn->off; CONT; JMP32_JA: insn += insn->imm; CONT; JMP_EXIT: return BPF_R0; /* JMP */ #define COND_JMP(SIGN, OPCODE, CMP_OP) \ JMP_##OPCODE##_X: \ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_X: \ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP_##OPCODE##_K: \ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_K: \ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; COND_JMP(u, JEQ, ==) COND_JMP(u, JNE, !=) COND_JMP(u, JGT, >) COND_JMP(u, JLT, <) COND_JMP(u, JGE, >=) COND_JMP(u, JLE, <=) COND_JMP(u, JSET, &) COND_JMP(s, JSGT, >) COND_JMP(s, JSLT, <) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: /* Speculation barrier for mitigating Speculative Store Bypass, * Bounds-Check Bypass and Type Confusion. In case of arm64, we * rely on the firmware mitigation as controlled via the ssbd * kernel parameter. Whenever the mitigation is enabled, it * works for all of the kernel code with no need to provide any * additional instructions here. In case of x86, we use 'lfence' * insn for mitigation. We reuse preexisting logic from Spectre * v1 mitigation that happens to produce the required code on * x86 for v4 as well. */ barrier_nospec(); CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ CONT; \ ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) LDST(H, u16) LDST(W, u32) LDST(DW, u64) #undef LDST #define LDSX(SIZEOP, SIZE) \ LDX_MEMSX_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEMSX_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDSX(B, s8) LDSX(H, s16) LDSX(W, s32) #undef LDSX #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ else \ goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ else \ goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: STX_ATOMIC_H: STX_ATOMIC_B: switch (IMM) { /* Atomic read-modify-write instructions support only W and DW * size modifiers. */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: if (BPF_SIZE(insn->code) == BPF_W) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); else goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); else goto default_label; break; /* Atomic load and store instructions support all size * modifiers. */ case BPF_LOAD_ACQ: switch (BPF_SIZE(insn->code)) { #define LOAD_ACQUIRE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ DST = (SIZE)smp_load_acquire( \ (SIZE *)(unsigned long)(SRC + insn->off)); \ break; LOAD_ACQUIRE(B, u8) LOAD_ACQUIRE(H, u16) LOAD_ACQUIRE(W, u32) #ifdef CONFIG_64BIT LOAD_ACQUIRE(DW, u64) #endif #undef LOAD_ACQUIRE default: goto default_label; } break; case BPF_STORE_REL: switch (BPF_SIZE(insn->code)) { #define STORE_RELEASE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ smp_store_release( \ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ break; STORE_RELEASE(B, u8) STORE_RELEASE(H, u16) STORE_RELEASE(W, u32) #ifdef CONFIG_64BIT STORE_RELEASE(DW, u64) #endif #undef STORE_RELEASE default: goto default_label; } break; default: goto default_label; } CONT; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here * instead of just returning 0; we could be somewhere in a subprog, * so execution could continue otherwise which we do /not/ want. * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", insn->code, insn->imm); BUG_ON(1); return 0; } #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), static __maybe_unused u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif #endif static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } static bool __bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); struct bpf_prog_aux *aux = fp->aux; enum bpf_cgroup_storage_type i; bool ret = false; u64 cookie; if (fp->kprobe_override) return ret; spin_lock(&map->owner_lock); /* There's no owner yet where we could check for compatibility. */ if (!map->owner) { map->owner = bpf_map_owner_alloc(map); if (!map->owner) goto err; map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; } ret = true; } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; cookie = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || !cookie; } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_STRUCT_OPS: ret = false; break; default: break; } } } err: spin_unlock(&map->owner_lock); return ret; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { /* XDP programs inserted into maps are not guaranteed to run on * a particular netdev (and can run outside driver context entirely * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ if (bpf_prog_is_dev_bound(fp->aux)) return false; return __bpf_prog_map_compatible(map, fp); } static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; int i, ret = 0; mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; if (!map_type_contains_progs(map)) continue; if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } } out: mutex_unlock(&aux->used_maps_mutex); return ret; } static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; /* may_goto may cause stack size > 512, leading to idx out-of-bounds. * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif return select_interpreter; } /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ bool jit_needed = false; if (fp->bpf_func) goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || bpf_prog_has_kfunc_call(fp)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during * blinding, bpf_int_jit_compile() must always return a * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } } else { *err = bpf_prog_offload_compile(fp); if (*err) return fp; } finalize: *err = bpf_prog_lock_ro(fp); if (*err) return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, const struct bpf_insn *insn) { return 1; } static struct bpf_prog_dummy { struct bpf_prog prog; } dummy_bpf_prog = { .prog = { .bpf_func = __bpf_prog_ret1, }, }; struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { struct bpf_prog_array *p; if (prog_cnt) p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); else p = &bpf_empty_prog_array.hdr; return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; /* If RCU Tasks Trace grace period implies RCU grace period, there is * no need to call kfree_rcu(), just call kfree() directly. */ progs = container_of(rcu, struct bpf_prog_array, rcu); if (rcu_trace_implies_rcu_gp()) kfree(progs); else kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; return cnt; } bool bpf_prog_array_is_empty(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) return false; return true; } static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { item++; break; } } return !!(item->prog); } int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; bool nospc; u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; nospc = bpf_prog_array_copy_core(array, ids, cnt); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) return -EFAULT; if (nospc) return -ENOSPC; return 0; } void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } /** * bpf_prog_array_delete_safe_at() - Replaces the program at the given * index into the program array with * a dummy no-op program. * @array: a bpf_prog_array * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating * the position of the program to replace. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) { return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); } /** * bpf_prog_array_update_at() - Updates the program at the given index * into the program array. * @array: a bpf_prog_array * @index: the index of the program to update * @prog: the program to insert into the array * * Skips over dummy programs, by not counting them, when calculating * the position of the program to update. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog) { struct bpf_prog_array_item *item; if (unlikely(index < 0)) return -EINVAL; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; if (!index) { WRITE_ONCE(item->prog, prog); return 0; } index--; } return -ENOENT; } int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; /* Figure out how many existing progs we need to carry over to * the new array. */ if (old_array) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog) { found_exclude = true; continue; } if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (existing->prog == include_prog) return -EEXIST; } } if (exclude_prog && !found_exclude) return -ENOENT; /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) new_prog_cnt += 1; /* Do we have any prog (not NULL) in the new array? */ if (!new_prog_cnt) { *new_array = NULL; return 0; } /* +1 as the end of prog_array is marked with NULL */ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog || existing->prog == &dummy_bpf_prog.prog) continue; new->prog = existing->prog; new->bpf_cookie = existing->bpf_cookie; new++; } } if (include_prog) { new->prog = include_prog; new->bpf_cookie = bpf_cookie; new++; } new->prog = NULL; *new_array = array; return 0; } int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; bool sleepable; u32 i; sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); if (sleepable) atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } static void bpf_free_used_maps(struct bpf_prog_aux *aux) { __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; u32 i; for (i = 0; i < len; i++) { btf_mod = &used_btfs[i]; if (btf_mod->module) module_put(btf_mod->module); btf_put(btf_mod->btf); } #endif } static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. */ aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { bpf_jit_free(aux->prog); } } void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); /* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) { prandom_init_once(&bpf_user_rnd_state); } BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that * this function is called from native eBPF and classic-to-eBPF * transformations. Register assignments from both sides are * different, f.e. classic always sets fn(ctx, A, X) here. */ struct rnd_state *state; u32 res; state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); put_cpu_var(bpf_user_rnd_state); return res; } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) { return NULL; } u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { return -ENOTSUPP; } EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { /* func is unused for tail_call, we set it to pass the * get_helper_proto check */ .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; /* Stub for JITs that only support cBPF. eBPF programs are interpreted. * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } /* Stub for JITs that support eBPF. All cBPF code gets transformed into * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). */ void __weak bpf_jit_compile(struct bpf_prog *prog) { } bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. * * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if * you don't override this. JITs that don't want these extra insns can detect * them using insn_is_zext. */ bool __weak bpf_jit_needs_zext(void) { return false; } /* By default, enable the verifier's mitigations against Spectre v1 and v4 for * all archs. The value returned must not change at runtime as there is * currently no support for reloading programs that were loaded without * mitigations. */ bool __weak bpf_jit_bypass_spec_v1(void) { return false; } bool __weak bpf_jit_bypass_spec_v4(void) { return false; } /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * * The verifier will not patch the insn->imm for the call to the helper if * this returns true. */ bool __weak bpf_jit_inlines_helper_call(s32 imm) { return false; } /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } bool __weak bpf_jit_supports_percpu_insn(void) { return false; } bool __weak bpf_jit_supports_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_arena(void) { return false; } bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { return false; } u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) return TASK_SIZE; #else return 0; #endif } /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same * as atomic_xchg() on pointer-sized words. */ bool __weak bpf_jit_supports_ptr_xchg(void) { return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { return -EFAULT; } int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, enum bpf_text_poke_type new_t, void *old_addr, void *new_addr) { return -ENOTSUPP; } void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) { return ERR_PTR(-ENOTSUPP); } int __weak bpf_arch_text_invalidate(void *dst, size_t len) { return -ENOTSUPP; } bool __weak bpf_jit_supports_exceptions(void) { return false; } bool __weak bpf_jit_supports_private_stack(void) { return false; } void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } bool __weak bpf_jit_supports_timed_may_goto(void) { return false; } u64 __weak arch_bpf_timed_may_goto(void) { return 0; } static noinline void bpf_prog_report_may_goto_violation(void) { #ifdef CONFIG_BPF_SYSCALL struct bpf_stream_stage ss; struct bpf_prog *prog; prog = bpf_prog_find_from_stack(); if (!prog) return; bpf_stream_stage(ss, prog, BPF_STDERR, ({ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); bpf_stream_dump_stack(ss); })); #endif } u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); /* Populate the timestamp for this stack frame, and refresh count. */ if (!p->timestamp) { p->timestamp = time; return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { bpf_prog_report_may_goto_violation(); return 0; } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return 0; } __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; return ret; } late_initcall(bpf_global_ma_init); #endif DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); #ifdef CONFIG_BPF_SYSCALL int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump) { int idx = -1, insn_start, insn_end, len; struct bpf_line_info *linfo; void **jited_linfo; struct btf *btf; int nr_linfo; btf = prog->aux->btf; linfo = prog->aux->linfo; jited_linfo = prog->aux->jited_linfo; if (!btf || !linfo || !jited_linfo) return -EINVAL; len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; linfo = &prog->aux->linfo[prog->aux->linfo_idx]; jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + len; nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; for (int i = 0; i < nr_linfo && linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { if (jited_linfo[i] >= (void *)ip) break; idx = i; } if (idx == -1) return -ENOENT; /* Get base component of the file path. */ *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); *filep = kbasename(*filep); /* Obtain the source line, and strip whitespace in prefix. */ *linep = btf_name_by_offset(btf, linfo[idx].line_off); while (isspace(**linep)) *linep += 1; *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); return 0; } struct walk_stack_ctx { struct bpf_prog *prog; }; static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) { struct walk_stack_ctx *ctxp = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return true; /* Make sure we return the main prog if we found a subprog */ ctxp->prog = prog->aux->main_prog_aux->prog; return false; } struct bpf_prog *bpf_prog_find_from_stack(void) { struct walk_stack_ctx ctx = {}; arch_bpf_stack_walk(find_from_stack_cb, &ctx); return ctx.prog; } #endif |
| 21 21 21 21 21 21 21 21 21 21 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2015 Intel Deutschland GmbH * Copyright (C) 2021-2023 Intel Corporation */ #include <linux/kobject.h> #include <linux/slab.h> #include "ieee80211_i.h" #include "key.h" #include "debugfs.h" #include "debugfs_key.h" #define KEY_READ(name, prop, format_string) \ static ssize_t key_##name##_read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_key *key = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, key->prop); \ } #define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") #define KEY_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_OPS_W(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .write = key_##name##_write, \ .llseek = generic_file_llseek, \ } #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) #define KEY_CONF_READ(name, format_string) \ KEY_READ(conf_##name, conf.name, format_string) #define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") #define KEY_CONF_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_CONF_FILE(name, format) \ KEY_CONF_READ_##format(name) \ KEY_CONF_OPS(name) KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15]; struct ieee80211_key *key = file->private_data; u32 c = key->conf.cipher; sprintf(buf, "%.2x-%.2x-%.2x:%d\n", c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; u64 pn; int ret; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: /* not supported yet */ return -EOPNOTSUPP; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ret = kstrtou64_from_user(userbuf, count, 16, &pn); if (ret) return ret; /* PN is a 48-bit counter */ if (pn >= (1ULL << 48)) return -ERANGE; atomic64_set(&key->conf.tx_pn, pn); return count; default: return 0; } } static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { u64 pn; char buf[20]; int len; struct ieee80211_key *key = file->private_data; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", TKIP_PN_TO_IV32(pn), TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[14*IEEE80211_NUM_TIDS+1], *p = buf; int i, len; const u8 *rpn; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", key->u.tkip.rx[i].iv32, key->u.tkip.rx[i].iv16); len = p - buf; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: rpn = key->u.aes_gmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.gcmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(rx_spec); static ssize_t key_replays_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.replays); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.gcmp.replays); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.icverrors); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(icverrors); static ssize_t key_mic_failures_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; if (key->conf.cipher != WLAN_CIPHER_SUITE_TKIP) return -EINVAL; len = scnprintf(buf, sizeof(buf), "%u\n", key->u.tkip.mic_failures); return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(mic_failures); static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; int i, bufsize = 2 * key->conf.keylen + 2; char *buf = kmalloc(bufsize, GFP_KERNEL); char *p = buf; ssize_t res; if (!buf) return -ENOMEM; for (i = 0; i < key->conf.keylen; i++) p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); p += scnprintf(p, bufsize+buf-p, "\n"); res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return res; } KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops) #define DEBUGFS_ADD_W(name) \ debugfs_create_file(#name, 0600, key->debugfs.dir, \ key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { static int keycount; char buf[100]; struct sta_info *sta; if (!key->local->debugfs.keys) return; sprintf(buf, "%d", keycount); key->debugfs.cnt = keycount; keycount++; key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", sta->sdata->name, sta->sta.addr); key->debugfs.stalink = debugfs_create_symlink("station", key->debugfs.dir, buf); } DEBUGFS_ADD(keylen); DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); DEBUGFS_ADD(algorithm); DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); DEBUGFS_ADD(mic_failures); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; void ieee80211_debugfs_key_remove(struct ieee80211_key *key) { if (!key) return; debugfs_remove_recursive(key->debugfs.dir); key->debugfs.dir = NULL; } void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; lockdep_assert_wiphy(sdata->local->hw.wiphy); debugfs_remove(sdata->debugfs.default_unicast_key); sdata->debugfs.default_unicast_key = NULL; if (sdata->default_unicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->default_unicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_unicast_key = debugfs_create_symlink("default_unicast_key", sdata->vif.debugfs_dir, buf); } debugfs_remove(sdata->debugfs.default_multicast_key); sdata->debugfs.default_multicast_key = NULL; if (sdata->deflink.default_multicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_multicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_multicast_key = debugfs_create_symlink("default_multicast_key", sdata->vif.debugfs_dir, buf); } } void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_mgmt_key); sdata->debugfs.default_mgmt_key = NULL; } void ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_beacon_key); sdata->debugfs.default_beacon_key = NULL; } |
| 7 7 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2008, Jouni Malinen <j@w1.fi> * Copyright (C) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2020-2023 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/compiler.h> #include <linux/ieee80211.h> #include <linux/gfp.h> #include <linux/unaligned.h> #include <net/mac80211.h> #include <crypto/aes.h> #include <crypto/utils.h> #include "ieee80211_i.h" #include "michael.h" #include "tkip.h" #include "aes_ccm.h" #include "aes_cmac.h" #include "aes_gmac.h" #include "aes_gcm.h" #include "wpa.h" ieee80211_tx_result ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) { u8 *data, *key, *mic; size_t data_len; unsigned int hdrlen; struct ieee80211_hdr *hdr; struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tail; hdr = (struct ieee80211_hdr *)skb->data; if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control)) return TX_CONTINUE; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen) return TX_DROP; data = skb->data + hdrlen; data_len = skb->len - hdrlen; if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) { /* Need to use software crypto for the test */ info->control.hw_key = NULL; } if (info->control.hw_key && (info->flags & IEEE80211_TX_CTL_DONTFRAG || ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) && !(tx->key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE))) { /* hwaccel - with no need for SW-generated MMIC or MIC space */ return TX_CONTINUE; } tail = MICHAEL_MIC_LEN; if (!info->control.hw_key) tail += IEEE80211_TKIP_ICV_LEN; if (WARN(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_TKIP_IV_LEN, "mmic: not enough head/tail (%d/%d,%d/%d)\n", skb_headroom(skb), IEEE80211_TKIP_IV_LEN, skb_tailroom(skb), tail)) return TX_DROP; mic = skb_put(skb, MICHAEL_MIC_LEN); if (tx->key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) { /* Zeroed MIC can help with debug */ memset(mic, 0, MICHAEL_MIC_LEN); return TX_CONTINUE; } key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; michael_mic(key, hdr, data, data_len, mic); if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) mic[0]++; return TX_CONTINUE; } ieee80211_rx_result ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) { u8 *data, *key = NULL; size_t data_len; unsigned int hdrlen; u8 mic[MICHAEL_MIC_LEN]; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; /* * it makes no sense to check for MIC errors on anything other * than data frames. */ if (!ieee80211_is_data_present(hdr->frame_control)) return RX_CONTINUE; /* * No way to verify the MIC if the hardware stripped it or * the IV with the key index. In this case we have solely rely * on the driver to set RX_FLAG_MMIC_ERROR in the event of a * MIC failure report. */ if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) { if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail_no_key; if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP) goto update_iv; return RX_CONTINUE; } /* * Some hardware seems to generate Michael MIC failure reports; even * though, the frame was not encrypted with TKIP and therefore has no * MIC. Ignore the flag them to avoid triggering countermeasures. */ if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || !(status->flag & RX_FLAG_DECRYPTED)) return RX_CONTINUE; if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) { /* * APs with pairwise keys should never receive Michael MIC * errors for non-zero keyidx because these are reserved for * group keys and only the AP is sending real multicast * frames in the BSS. */ return RX_DROP_U_AP_RX_GROUPCAST; } if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + MICHAEL_MIC_LEN) return RX_DROP_U_SHORT_MMIC; if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; hdr = (void *)skb->data; data = skb->data + hdrlen; data_len = skb->len - hdrlen - MICHAEL_MIC_LEN; key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY]; michael_mic(key, hdr, data, data_len, mic); if (crypto_memneq(mic, data + data_len, MICHAEL_MIC_LEN)) goto mic_fail; /* remove Michael MIC from payload */ skb_trim(skb, skb->len - MICHAEL_MIC_LEN); update_iv: /* update IV in key information to be able to detect replays */ rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip.iv32; rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip.iv16; return RX_CONTINUE; mic_fail: rx->key->u.tkip.mic_failures++; mic_fail_no_key: /* * In some cases the key can be unset - e.g. a multicast packet, in * a driver that supports HW encryption. Send up the key idx only if * the key is set. */ cfg80211_michael_mic_failure(rx->sdata->dev, hdr->addr2, is_multicast_ether_addr(hdr->addr1) ? NL80211_KEYTYPE_GROUP : NL80211_KEYTYPE_PAIRWISE, rx->key ? rx->key->conf.keyidx : -1, NULL, GFP_ATOMIC); return RX_DROP_U_MMIC_FAIL; } static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; int len, tail; u64 pn; u8 *pos; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { /* hwaccel - with no need for software-generated IV */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = IEEE80211_TKIP_ICV_LEN; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_TKIP_IV_LEN)) return -1; pos = skb_push(skb, IEEE80211_TKIP_IV_LEN); memmove(pos, pos + IEEE80211_TKIP_IV_LEN, hdrlen); pos += hdrlen; /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; /* Increase IV for the frame */ pn = atomic64_inc_return(&key->conf.tx_pn); pos = ieee80211_tkip_add_iv(pos, &key->conf, pn); /* hwaccel - with software IV */ if (info->control.hw_key) return 0; /* Add room for ICV */ skb_put(skb, IEEE80211_TKIP_ICV_LEN); return ieee80211_tkip_encrypt_data(&tx->local->wep_tx_ctx, key, skb, pos, len); } ieee80211_tx_result ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (tkip_encrypt_skb(tx, skb) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data; int hdrlen, res, hwaccel = 0; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; if (!rx->sta || skb->len - hdrlen < 12) return RX_DROP_U_SHORT_TKIP; /* it may be possible to optimize this a bit more */ if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; hdr = (void *)skb->data; /* * Let TKIP code verify IV, but skip decryption. * In the case where hardware checks the IV as well, * we don't even get here, see ieee80211_rx_h_decrypt() */ if (status->flag & RX_FLAG_DECRYPTED) hwaccel = 1; res = ieee80211_tkip_decrypt_data(&rx->local->wep_rx_ctx, key, skb->data + hdrlen, skb->len - hdrlen, rx->sta->sta.addr, hdr->addr1, hwaccel, rx->security_idx, &rx->tkip.iv32, &rx->tkip.iv16); if (res != TKIP_DECRYPT_OK) return RX_DROP_U_TKIP_FAIL; /* Trim ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED)) skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_TKIP_IV_LEN); return RX_CONTINUE; } /* * Calculate AAD for CCMP/GCMP, returning qos_tid since we * need that in CCMP also for b_0. */ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu) { struct ieee80211_hdr *hdr = (void *)skb->data; __le16 mask_fc; int a4_included, mgmt; u8 qos_tid; u16 len_a = 22; /* * Mask FC: zero subtype b4 b5 b6 (if not mgmt) * Retry, PwrMgt, MoreData, Order (if Qos Data); set Protected */ mgmt = ieee80211_is_mgmt(hdr->frame_control); mask_fc = hdr->frame_control; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); if (!mgmt) mask_fc &= ~cpu_to_le16(0x0070); mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); a4_included = ieee80211_has_a4(hdr->frame_control); if (a4_included) len_a += 6; if (ieee80211_is_data_qos(hdr->frame_control)) { qos_tid = *ieee80211_get_qos_ctl(hdr); if (spp_amsdu) qos_tid &= IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_A_MSDU_PRESENT; else qos_tid &= IEEE80211_QOS_CTL_TID_MASK; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); len_a += 2; } else { qos_tid = 0; } /* AAD (extra authenticate-only data) / masked 802.11 header * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; aad[23] = 0; if (a4_included) { memcpy(&aad[24], hdr->addr4, ETH_ALEN); aad[30] = qos_tid; aad[31] = 0; } else { memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN); aad[24] = qos_tid; } return qos_tid; } static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad, bool spp_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; u8 qos_tid = ccmp_gcmp_aad(skb, aad, spp_amsdu); /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC * mode authentication are not allowed to collide, yet both are derived * from this vector b_0. We only set L := 1 here to indicate that the * data size can be represented in (L+1) bytes. The CCM layer will take * care of storing the data length in the top (L+1) bytes and setting * and clearing the other bits as is required to derive the two IVs. */ b_0[0] = 0x1; /* Nonce: Nonce Flags | A2 | PN * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) */ b_0[1] = qos_tid | (ieee80211_is_mgmt(hdr->frame_control) << 4); memcpy(&b_0[2], hdr->addr2, ETH_ALEN); memcpy(&b_0[8], pn, IEEE80211_CCMP_PN_LEN); } static inline void ccmp_pn2hdr(u8 *hdr, u8 *pn, int key_id) { hdr[0] = pn[5]; hdr[1] = pn[4]; hdr[2] = 0; hdr[3] = 0x20 | (key_id << 6); hdr[4] = pn[3]; hdr[5] = pn[2]; hdr[6] = pn[1]; hdr[7] = pn[0]; } static inline void ccmp_hdr2pn(u8 *pn, u8 *hdr) { pn[0] = hdr[7]; pn[1] = hdr[6]; pn[2] = hdr[5]; pn[3] = hdr[4]; pn[4] = hdr[1]; pn[5] = hdr[0]; } static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, unsigned int mic_len) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int hdrlen, len, tail; u8 *pos; u8 pn[6]; u64 pn64; u8 aad[CCM_AAD_LEN]; u8 b_0[AES_BLOCK_SIZE]; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && !((info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) && ieee80211_is_mgmt(hdr->frame_control))) { /* * hwaccel has no need for preallocated room for CCMP * header or MIC fields */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = mic_len; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_CCMP_HDR_LEN)) return -1; pos = skb_push(skb, IEEE80211_CCMP_HDR_LEN); memmove(pos, pos + IEEE80211_CCMP_HDR_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; pos += hdrlen; pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; pn[3] = pn64 >> 16; pn[2] = pn64 >> 24; pn[1] = pn64 >> 32; pn[0] = pn64 >> 40; ccmp_pn2hdr(pos, pn, key->conf.keyidx); /* hwaccel - with software CCMP header */ if (info->control.hw_key) return 0; pos += IEEE80211_CCMP_HDR_LEN; ccmp_special_blocks(skb, pn, b_0, aad, key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, skb_put(skb, mic_len)); } ieee80211_tx_result ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx, unsigned int mic_len) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (ccmp_encrypt_skb(tx, skb, mic_len) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, unsigned int mic_len) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; int hdrlen; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 pn[IEEE80211_CCMP_PN_LEN]; int data_len; int queue; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN)) return RX_DROP_U_SHORT_CCMP; if (status->flag & RX_FLAG_MIC_STRIPPED) mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; } /* reload hdr - skb might have been reallocated */ hdr = (void *)rx->skb->data; data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_U_SHORT_CCMP; if (!(status->flag & RX_FLAG_PN_VALIDATED)) { int res; ccmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; res = memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); if (res < 0 || (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.ccmp.replays++; return RX_DROP_U_REPLAY; } if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 b_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ ccmp_special_blocks(skb, pn, b_0, aad, key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); if (ieee80211_aes_ccm_decrypt( key->u.ccmp.tfm, b_0, aad, skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, data_len, skb->data + skb->len - mic_len)) return RX_DROP_U_MIC_FAIL; } memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); if (unlikely(ieee80211_is_frag(hdr))) memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove CCMP header and MIC */ if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_U_SHORT_CCMP_MIC; memmove(skb->data + IEEE80211_CCMP_HDR_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_CCMP_HDR_LEN); return RX_CONTINUE; } static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad, bool spp_amsdu) { struct ieee80211_hdr *hdr = (void *)skb->data; memcpy(j_0, hdr->addr2, ETH_ALEN); memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN); ccmp_gcmp_aad(skb, aad, spp_amsdu); } static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id) { hdr[0] = pn[5]; hdr[1] = pn[4]; hdr[2] = 0; hdr[3] = 0x20 | (key_id << 6); hdr[4] = pn[3]; hdr[5] = pn[2]; hdr[6] = pn[1]; hdr[7] = pn[0]; } static inline void gcmp_hdr2pn(u8 *pn, const u8 *hdr) { pn[0] = hdr[7]; pn[1] = hdr[6]; pn[2] = hdr[5]; pn[3] = hdr[4]; pn[4] = hdr[1]; pn[5] = hdr[0]; } static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int hdrlen, len, tail; u8 *pos; u8 pn[6]; u64 pn64; u8 aad[GCM_AAD_LEN]; u8 j_0[AES_BLOCK_SIZE]; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && !((info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) && ieee80211_is_mgmt(hdr->frame_control))) { /* hwaccel has no need for preallocated room for GCMP * header or MIC fields */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = IEEE80211_GCMP_MIC_LEN; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_GCMP_HDR_LEN)) return -1; pos = skb_push(skb, IEEE80211_GCMP_HDR_LEN); memmove(pos, pos + IEEE80211_GCMP_HDR_LEN, hdrlen); skb_set_network_header(skb, skb_network_offset(skb) + IEEE80211_GCMP_HDR_LEN); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; pos += hdrlen; pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; pn[3] = pn64 >> 16; pn[2] = pn64 >> 24; pn[1] = pn64 >> 32; pn[0] = pn64 >> 40; gcmp_pn2hdr(pos, pn, key->conf.keyidx); /* hwaccel - with software GCMP header */ if (info->control.hw_key) return 0; pos += IEEE80211_GCMP_HDR_LEN; gcmp_special_blocks(skb, pn, j_0, aad, key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len, skb_put(skb, IEEE80211_GCMP_MIC_LEN)); } ieee80211_tx_result ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (gcmp_encrypt_skb(tx, skb) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; int hdrlen; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 pn[IEEE80211_GCMP_PN_LEN]; int data_len, queue, mic_len = IEEE80211_GCMP_MIC_LEN; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN)) return RX_DROP_U_SHORT_GCMP; if (status->flag & RX_FLAG_MIC_STRIPPED) mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; } /* reload hdr - skb might have been reallocated */ hdr = (void *)rx->skb->data; data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_U_SHORT_GCMP; if (!(status->flag & RX_FLAG_PN_VALIDATED)) { int res; gcmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; res = memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN); if (res < 0 || (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.gcmp.replays++; return RX_DROP_U_REPLAY; } if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 j_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ gcmp_special_blocks(skb, pn, j_0, aad, key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); if (ieee80211_aes_gcm_decrypt( key->u.gcmp.tfm, j_0, aad, skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, data_len, skb->data + skb->len - IEEE80211_GCMP_MIC_LEN)) return RX_DROP_U_MIC_FAIL; } memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); if (unlikely(ieee80211_is_frag(hdr))) memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove GCMP header and MIC */ if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_U_SHORT_GCMP_MIC; memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_GCMP_HDR_LEN); return RX_CONTINUE; } static void bip_aad(struct sk_buff *skb, u8 *aad) { __le16 mask_fc; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; /* BIP AAD: FC(masked) || A1 || A2 || A3 */ /* FC type/subtype */ /* Mask FC Retry, PwrMgt, MoreData flags to zero */ mask_fc = hdr->frame_control; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); put_unaligned(mask_fc, (__le16 *) &aad[0]); /* A1 || A2 || A3 */ memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); } static inline void bip_ipn_set64(u8 *d, u64 pn) { *d++ = pn; *d++ = pn >> 8; *d++ = pn >> 16; *d++ = pn >> 24; *d++ = pn >> 32; *d = pn >> 40; } static inline void bip_ipn_swap(u8 *d, const u8 *s) { *d++ = s[5]; *d++ = s[4]; *d++ = s[3]; *d++ = s[2]; *d++ = s[1]; *d = s[0]; } ieee80211_tx_result ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx, unsigned int mic_len) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; struct ieee80211_mmie_var *mmie; size_t mmie_len; u8 aad[20]; u64 pn64; if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) return TX_DROP; skb = skb_peek(&tx->skbs); info = IEEE80211_SKB_CB(skb); if (info->control.hw_key && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; mmie_len = sizeof(*mmie) + mic_len; if (WARN_ON(skb_tailroom(skb) < mmie_len)) return TX_DROP; mmie = skb_put(skb, mmie_len); mmie->element_id = WLAN_EID_MMIE; mmie->length = mmie_len - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); if (info->control.hw_key) return TX_CONTINUE; bip_aad(skb, aad); if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mmie->mic, mic_len)) return TX_DROP; return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx, unsigned int mic_len) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; struct ieee80211_mmie_var *mmie; size_t mmie_len; u8 aad[20], mic[IEEE80211_CMAC_256_MIC_LEN], ipn[6]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; mmie_len = sizeof(*mmie) + mic_len; /* management frames are already linear */ if (skb->len < 24 + mmie_len) return mic_len == IEEE80211_CMAC_128_MIC_LEN ? RX_DROP_U_SHORT_CMAC : RX_DROP_U_SHORT_CMAC256; mmie = (struct ieee80211_mmie_var *)(skb->data + skb->len - mmie_len); if (mmie->element_id != WLAN_EID_MMIE || mmie->length != mmie_len - 2) return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { key->u.aes_cmac.replays++; return RX_DROP_U_REPLAY; } if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mic, mic_len)) return RX_DROP_U_DECRYPT_FAIL; if (crypto_memneq(mic, mmie->mic, mic_len)) { key->u.aes_cmac.icverrors++; return RX_DROP_U_MIC_FAIL; } } memcpy(key->u.aes_cmac.rx_pn, ipn, 6); /* Remove MMIE */ skb_trim(skb, skb->len - mmie_len); return RX_CONTINUE; } ieee80211_tx_result ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; struct ieee80211_mmie_16 *mmie; struct ieee80211_hdr *hdr; u8 aad[GMAC_AAD_LEN]; u64 pn64; u8 nonce[GMAC_NONCE_LEN]; if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) return TX_DROP; skb = skb_peek(&tx->skbs); info = IEEE80211_SKB_CB(skb); if (info->control.hw_key && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) return TX_DROP; mmie = skb_put(skb, sizeof(*mmie)); mmie->element_id = WLAN_EID_MMIE; mmie->length = sizeof(*mmie) - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); if (info->control.hw_key) return TX_CONTINUE; bip_aad(skb, aad); hdr = (struct ieee80211_hdr *)skb->data; memcpy(nonce, hdr->addr2, ETH_ALEN); bip_ipn_swap(nonce + ETH_ALEN, mmie->sequence_number); /* MIC = AES-GMAC(IGTK, AAD || Management Frame Body || MMIE, 128) */ if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, skb->data + 24, skb->len - 24, mmie->mic) < 0) return TX_DROP; return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; struct ieee80211_mmie_16 *mmie; u8 aad[GMAC_AAD_LEN], *mic, ipn[6], nonce[GMAC_NONCE_LEN]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; /* management frames are already linear */ if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_U_SHORT_GMAC; mmie = (struct ieee80211_mmie_16 *) (skb->data + skb->len - sizeof(*mmie)); if (mmie->element_id != WLAN_EID_MMIE || mmie->length != sizeof(*mmie) - 2) return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) { key->u.aes_gmac.replays++; return RX_DROP_U_REPLAY; } if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); memcpy(nonce, hdr->addr2, ETH_ALEN); memcpy(nonce + ETH_ALEN, ipn, 6); mic = kmalloc(IEEE80211_GMAC_MIC_LEN, GFP_ATOMIC); if (!mic) return RX_DROP_U_OOM; if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, skb->data + 24, skb->len - 24, mic) < 0 || crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { key->u.aes_gmac.icverrors++; kfree(mic); return RX_DROP_U_MIC_FAIL; } kfree(mic); } memcpy(key->u.aes_gmac.rx_pn, ipn, 6); /* Remove MMIE */ skb_trim(skb, skb->len - sizeof(*mmie)); return RX_CONTINUE; } |
| 28 28 22 13 15 27 27 17 17 12 12 13 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 /* * direct.c - Low-level direct PCI config space access */ #include <linux/pci.h> #include <linux/init.h> #include <linux/dmi.h> #include <asm/pci_x86.h> /* * Functions for accessing PCI base (first 256 bytes) and extended * (4096 bytes per PCI function) configuration space with type 1 * accesses. */ #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \ | (devfn << 8) | (reg & 0xFC)) static int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) { *value = -1; return -EINVAL; } raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); switch (len) { case 1: *value = inb(0xCFC + (reg & 3)); break; case 2: *value = inw(0xCFC + (reg & 2)); break; case 4: *value = inl(0xCFC); break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } static int pci_conf1_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); switch (len) { case 1: outb((u8)value, 0xCFC + (reg & 3)); break; case 2: outw((u16)value, 0xCFC + (reg & 2)); break; case 4: outl((u32)value, 0xCFC); break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } #undef PCI_CONF1_ADDRESS const struct pci_raw_ops pci_direct_conf1 = { .read = pci_conf1_read, .write = pci_conf1_write, }; /* * Functions for accessing PCI configuration space with type 2 accesses */ #define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg) static int pci_conf2_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; int dev, fn; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) { *value = -1; return -EINVAL; } dev = PCI_SLOT(devfn); fn = PCI_FUNC(devfn); if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); switch (len) { case 1: *value = inb(PCI_CONF2_ADDRESS(dev, reg)); break; case 2: *value = inw(PCI_CONF2_ADDRESS(dev, reg)); break; case 4: *value = inl(PCI_CONF2_ADDRESS(dev, reg)); break; } outb(0, 0xCF8); raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } static int pci_conf2_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; int dev, fn; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; dev = PCI_SLOT(devfn); fn = PCI_FUNC(devfn); if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); switch (len) { case 1: outb((u8)value, PCI_CONF2_ADDRESS(dev, reg)); break; case 2: outw((u16)value, PCI_CONF2_ADDRESS(dev, reg)); break; case 4: outl((u32)value, PCI_CONF2_ADDRESS(dev, reg)); break; } outb(0, 0xCF8); raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } #undef PCI_CONF2_ADDRESS static const struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; /* * Before we decide to use direct hardware access mechanisms, we try to do some * trivial checks to ensure it at least _seems_ to be working -- we just test * whether bus 00 contains a host bridge (this is similar to checking * techniques used in XFree86, but ours should be more reliable since we * attempt to make use of direct access hints provided by the PCI BIOS). * * This should be close to trivial, but it isn't, because there are buggy * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. */ static int __init pci_sanity_check(const struct pci_raw_ops *o) { u32 x = 0; int devfn; if (pci_probe & PCI_NO_CHECKS) return 1; /* Assume Type 1 works for newer systems. This handles machines that don't have anything on PCI Bus 0. */ if (dmi_get_bios_year() >= 2001) return 1; for (devfn = 0; devfn < 0x100; devfn++) { if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x)) continue; if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA) return 1; if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x)) continue; if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ) return 1; } DBG(KERN_WARNING "PCI: Sanity check failed\n"); return 0; } static int __init pci_check_type1(void) { unsigned long flags; unsigned int tmp; int works = 0; local_irq_save(flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { works = 1; } outl(tmp, 0xCF8); local_irq_restore(flags); return works; } static int __init pci_check_type2(void) { unsigned long flags; int works = 0; local_irq_save(flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && pci_sanity_check(&pci_direct_conf2)) { works = 1; } local_irq_restore(flags); return works; } void __init pci_direct_init(int type) { if (type == 0) return; printk(KERN_INFO "PCI: Using configuration type %d for base access\n", type); if (type == 1) { raw_pci_ops = &pci_direct_conf1; if (raw_pci_ext_ops) return; if (!(pci_probe & PCI_HAS_IO_ECS)) return; printk(KERN_INFO "PCI: Using configuration type 1 " "for extended access\n"); raw_pci_ext_ops = &pci_direct_conf1; return; } raw_pci_ops = &pci_direct_conf2; } int __init pci_direct_probe(void) { if ((pci_probe & PCI_PROBE_CONF1) == 0) goto type2; if (!request_region(0xCF8, 8, "PCI conf1")) goto type2; if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; port_cf9_safe = true; return 1; } release_region(0xCF8, 8); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) return 0; if (!request_region(0xCF8, 4, "PCI conf2")) return 0; if (!request_region(0xC000, 0x1000, "PCI conf2")) goto fail2; if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; port_cf9_safe = true; return 2; } release_region(0xC000, 0x1000); fail2: release_region(0xCF8, 4); return 0; } |
| 746 744 744 741 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) { const insn_attr_t *table; if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) return 0; map_select -= X86_XOP_M_MIN; /* At first, this checks the master table */ table = inat_xop_tables[map_select]; if (!table) return 0; return table[opcode]; } |
| 177 178 178 178 208 209 209 209 209 209 209 209 209 202 203 202 203 203 202 203 209 208 209 209 208 209 208 208 209 209 157 209 209 208 209 209 209 209 209 209 209 209 2 209 209 209 209 208 208 209 209 209 208 208 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 | // SPDX-License-Identifier: GPL-2.0 /* * fs/mpage.c * * Copyright (C) 2002, Linus Torvalds. * * Contains functions related to preparing and submitting BIOs which contain * multiple pagecache pages. * * 15May2002 Andrew Morton * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "internal.h" /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_read_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, err == 0); bio_put(bio); } static void mpage_write_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) { if (err) mapping_set_error(fi.folio->mapping, err); folio_end_writeback(fi.folio); } bio_put(bio); } static struct bio *mpage_bio_submit_read(struct bio *bio) { bio->bi_end_io = mpage_read_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } static struct bio *mpage_bio_submit_write(struct bio *bio) { bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } /* * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows read_folio to avoid triggering a duplicate call * to get_block. * * The idea is to avoid adding buffers to pages that don't already have * them. So when the buffer is up to date and the page size == block size, * this marks the page up to date instead of adding new buffers. */ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, int page_block) { struct inode *inode = folio->mapping->host; struct buffer_head *page_bh, *head; int block = 0; head = folio_buffers(folio); if (!head) { /* * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; } head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; do { if (block == page_block) { page_bh->b_state = bh->b_state; page_bh->b_bdev = bh->b_bdev; page_bh->b_blocknr = bh->b_blocknr; break; } page_bh = page_bh->b_this_page; block++; } while (page_bh != head); } struct mpage_readpage_args { struct bio *bio; struct folio *folio; unsigned int nr_pages; bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; }; /* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. */ static void do_mpage_readpage(struct mpage_readpage_args *args) { struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; blk_opf_t opf = REQ_OP_READ; unsigned nblocks; unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; } if (folio_buffers(folio)) goto confused; block_in_file = folio_pos(folio) >> blkbits; last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ nblocks = map_bh->b_size >> blkbits; if (buffer_mapped(map_bh) && block_in_file > args->first_logical_block && block_in_file < (args->first_logical_block + nblocks)) { unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; first_block = map_bh->b_blocknr + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this folio. */ map_bh->b_folio = folio; while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (args->get_block(inode, block_in_file, map_bh, 0)) goto confused; args->first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_folio copies the data * we just collected from get_block into the folio's buffers * so read_folio doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { map_buffer_to_folio(folio, map_bh, page_block); goto confused; } if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map_bh->b_blocknr; else if (first_block + page_block != map_bh->b_blocknr) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); goto out; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != first_block - 1)) args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) goto confused; args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); } length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return; confused: if (args->bio) args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else folio_unlock(folio); goto out; } /** * mpage_readahead - start reads against pages * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and * emitting large BIOs. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * * BH_Boundary explanation: * * There is a problem. The mpage read code assembles several pages, gets all * their disk mappings, and then submits them all. That's fine, but obtaining * the disk mappings may require I/O. Reads of indirect blocks, for example. * * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks * 13,14,15,16. Obviously, this impacts performance. * * So what we do it to allow the filesystem's get_block() function to set * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block * after this one will require I/O against a block which is probably close to * this one. So you should push what I/O you have currently accumulated. * * This all causes the disk requests to be issued in the correct order. */ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { struct folio *folio; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; while ((folio = readahead_folio(rac))) { prefetchw(&folio->flags); args.folio = folio; args.nr_pages = readahead_count(rac); do_mpage_readpage(&args); /* * If read ahead failed synchronously, it may cause by removed * device, or some filesystem metadata error. */ if (!folio_test_locked(folio) && !folio_test_uptodate(folio)) break; } if (args.bio) mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all */ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, .nr_pages = folio_nr_pages(folio), .get_block = get_block, }; do_mpage_readpage(&args); if (args.bio) mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); /* * Writing is not so simple. * * If the page has buffers then they will be used for obtaining the disk * mapping. We only support pages which are fully mapped-and-dirty, with a * special case for pages which are unmapped at the end: end-of-file. * * If the page has no buffers (preferred) then the page is mapped here. * * If all blocks are found to be contiguous then the page can go into the * BIO. Otherwise fall back to the mapping's writepage(). * * FIXME: This code wants an estimate of how many pages are still to be * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; }; /* * We have our BIO, so we can now mark the buffers clean. Make * sure to only clean buffers which we know we'll be writing. */ static void clean_buffers(struct folio *folio, unsigned first_unmapped) { unsigned buffer_counter = 0; struct buffer_head *bh, *head = folio_buffers(folio); if (!head) return; bh = head; do { if (buffer_counter++ == first_unmapped) break; clear_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); /* * we cannot drop the bh if the page is not uptodate or a concurrent * read_folio would fail to serialize with the bh and it would read from * disk before we reach the platter. */ if (buffer_heads_over_limit && folio_test_uptodate(folio)) try_to_free_buffers(folio); } static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio, struct mpage_data *mpd) { struct bio *bio = mpd->bio; struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; struct buffer_head *head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ page_block = 0; do { BUG_ON(buffer_locked(bh)); if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by * block_dirty_folio -> mmapped data */ if (buffer_dirty(bh)) goto confused; if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { if (bh->b_blocknr != first_block + page_block) goto confused; } else { first_block = bh->b_blocknr; } page_block++; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; boundary_bdev = bh->b_bdev; } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); if (first_unmapped) goto page_is_mapped; /* * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_folio(). If this address_space is also * using mpage_readahead then this can rarely happen. */ goto confused; } /* * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. */ if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (!buffer_mapped(&map_bh)) goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; } if (page_block) { if (map_bh.b_blocknr != first_block + page_block) goto confused; } else { first_block = map_bh.b_blocknr; } page_block++; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) break; block_in_file++; } BUG_ON(page_block == 0); first_unmapped = page_block; page_is_mapped: /* Don't bother writing beyond EOF, truncate will discard the folio */ if (folio_pos(folio) >= i_size) goto confused; length = folio_size(folio); if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not * written out to the file." */ length = i_size - folio_pos(folio); folio_zero_segment(folio, length, folio_size(folio)); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != first_block - 1) bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; } /* * Must try to add the page before marking the buffer clean or * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); goto alloc_new; } clean_buffers(folio, first_unmapped); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; confused: if (bio) bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable */ ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; return ret; } /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. * * This is a library function, which implements the writepages() * address_space_operation. */ int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { struct mpage_data mpd = { .get_block = get_block, }; struct folio *folio = NULL; struct blk_plug plug; int error; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &error))) error = mpage_write_folio(wbc, folio, &mpd); if (mpd.bio) mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return error; } EXPORT_SYMBOL(mpage_writepages); |
| 72 248 247 72 248 283 283 283 210 210 210 22 113 703 276 456 2 16 253 3 253 1 3 291 7 254 251 291 254 249 4 2 247 2 2 2 246 247 251 220 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * The base lksctp header. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> */ #ifndef __net_sctp_h__ #define __net_sctp_h__ /* Header Strategy. * Start getting some control over the header file dependencies: * includes * constants * structs * prototypes * macros, externs, and inlines * * Move test_frame specific items out of the kernel headers * and into the test frame headers. This is not perfect in any sense * and will continue to evolve. */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/idr.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_route.h> #endif #include <linux/uaccess.h> #include <asm/page.h> #include <net/sock.h> #include <net/snmp.h> #include <net/sctp/structs.h> #include <net/sctp/constants.h> #ifdef CONFIG_IP_SCTP_MODULE #define SCTP_PROTOSW_FLAG 0 #else /* static! */ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif /* * Function declarations. */ /* * sctp/protocol.c */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr, enum sctp_scope, gfp_t gfp, int flags); struct sctp_pf *sctp_get_pf_specific(sa_family_t family); int sctp_register_pf(struct sctp_pf *, sa_family_t); void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int); int sctp_udp_sock_start(struct net *net); void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); /* * sctp/primitive.c */ int sctp_primitive_ASSOCIATE(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SHUTDOWN(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ABORT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg); int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, void *arg); int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg); int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc, void *arg); /* * sctp/input.c */ int sctp_rcv(struct sk_buff *skb); int sctp_v4_err(struct sk_buff *skb, u32 info); int sctp_hash_endpoint(struct sctp_endpoint *ep); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); void sctp_err_finish(struct sock *, struct sctp_transport *); int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb); int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, struct sk_buff *); void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); int sctp_hash_transport(struct sctp_transport *t); void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c */ int __net_init sctp_proc_init(struct net *net); /* * sctp/offload.c */ int sctp_offload_init(void); /* * sctp/stream_sched.c */ void sctp_sched_ops_init(void); /* * sctp/stream.c */ int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params); int sctp_send_reset_assoc(struct sctp_association *asoc); int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params); /* * Module global variables */ /* * sctp/protocol.c */ extern struct kmem_cache *sctp_chunk_cachep __read_mostly; extern struct kmem_cache *sctp_bucket_cachep __read_mostly; extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; /* * Section: Macros, externs, and inlines */ /* SCTP SNMP MIB stats handlers */ #define SCTP_INC_STATS(net, field) SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define __SCTP_INC_STATS(net, field) __SNMP_INC_STATS((net)->sctp.sctp_statistics, field) #define SCTP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->sctp.sctp_statistics, field) /* sctp mib definitions */ enum { SCTP_MIB_NUM = 0, SCTP_MIB_CURRESTAB, /* CurrEstab */ SCTP_MIB_ACTIVEESTABS, /* ActiveEstabs */ SCTP_MIB_PASSIVEESTABS, /* PassiveEstabs */ SCTP_MIB_ABORTEDS, /* Aborteds */ SCTP_MIB_SHUTDOWNS, /* Shutdowns */ SCTP_MIB_OUTOFBLUES, /* OutOfBlues */ SCTP_MIB_CHECKSUMERRORS, /* ChecksumErrors */ SCTP_MIB_OUTCTRLCHUNKS, /* OutCtrlChunks */ SCTP_MIB_OUTORDERCHUNKS, /* OutOrderChunks */ SCTP_MIB_OUTUNORDERCHUNKS, /* OutUnorderChunks */ SCTP_MIB_INCTRLCHUNKS, /* InCtrlChunks */ SCTP_MIB_INORDERCHUNKS, /* InOrderChunks */ SCTP_MIB_INUNORDERCHUNKS, /* InUnorderChunks */ SCTP_MIB_FRAGUSRMSGS, /* FragUsrMsgs */ SCTP_MIB_REASMUSRMSGS, /* ReasmUsrMsgs */ SCTP_MIB_OUTSCTPPACKS, /* OutSCTPPacks */ SCTP_MIB_INSCTPPACKS, /* InSCTPPacks */ SCTP_MIB_T1_INIT_EXPIREDS, SCTP_MIB_T1_COOKIE_EXPIREDS, SCTP_MIB_T2_SHUTDOWN_EXPIREDS, SCTP_MIB_T3_RTX_EXPIREDS, SCTP_MIB_T4_RTO_EXPIREDS, SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS, SCTP_MIB_DELAY_SACK_EXPIREDS, SCTP_MIB_AUTOCLOSE_EXPIREDS, SCTP_MIB_T1_RETRANSMITS, SCTP_MIB_T3_RETRANSMITS, SCTP_MIB_PMTUD_RETRANSMITS, SCTP_MIB_FAST_RETRANSMITS, SCTP_MIB_IN_PKT_SOFTIRQ, SCTP_MIB_IN_PKT_BACKLOG, SCTP_MIB_IN_PKT_DISCARDS, SCTP_MIB_IN_DATA_CHUNK_DISCARDS, __SCTP_MIB_MAX }; #define SCTP_MIB_MAX __SCTP_MIB_MAX struct sctp_mib { unsigned long mibs[SCTP_MIB_MAX]; }; /* helper function to track stats about max rto and related transport */ static inline void sctp_max_rto(struct sctp_association *asoc, struct sctp_transport *trans) { if (asoc->stats.max_obs_rto < (__u64)trans->rto) { asoc->stats.max_obs_rto = trans->rto; memset(&asoc->stats.obs_rto_ipaddr, 0, sizeof(struct sockaddr_storage)); memcpy(&asoc->stats.obs_rto_ipaddr, &trans->ipaddr, trans->af_specific->sockaddr_len); } } /* * Macros for keeping a global reference of object allocations. */ #ifdef CONFIG_SCTP_DBG_OBJCNT extern atomic_t sctp_dbg_objcnt_sock; extern atomic_t sctp_dbg_objcnt_ep; extern atomic_t sctp_dbg_objcnt_assoc; extern atomic_t sctp_dbg_objcnt_transport; extern atomic_t sctp_dbg_objcnt_chunk; extern atomic_t sctp_dbg_objcnt_bind_addr; extern atomic_t sctp_dbg_objcnt_bind_bucket; extern atomic_t sctp_dbg_objcnt_addr; extern atomic_t sctp_dbg_objcnt_datamsg; extern atomic_t sctp_dbg_objcnt_keys; /* Macros to atomically increment/decrement objcnt counters. */ #define SCTP_DBG_OBJCNT_INC(name) \ atomic_inc(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT_DEC(name) \ atomic_dec(&sctp_dbg_objcnt_## name) #define SCTP_DBG_OBJCNT(name) \ atomic_t sctp_dbg_objcnt_## name = ATOMIC_INIT(0) /* Macro to help create new entries in the global array of * objcnt counters. */ #define SCTP_DBG_OBJCNT_ENTRY(name) \ {.label= #name, .counter= &sctp_dbg_objcnt_## name} void sctp_dbg_objcnt_init(struct net *); #else #define SCTP_DBG_OBJCNT_INC(name) #define SCTP_DBG_OBJCNT_DEC(name) static inline void sctp_dbg_objcnt_init(struct net *net) { return; } #endif /* CONFIG_SCTP_DBG_OBJCOUNT */ #if defined CONFIG_SYSCTL void sctp_sysctl_register(void); void sctp_sysctl_unregister(void); int sctp_sysctl_net_register(struct net *net); void sctp_sysctl_net_unregister(struct net *net); #else static inline void sctp_sysctl_register(void) { return; } static inline void sctp_sysctl_unregister(void) { return; } static inline int sctp_sysctl_net_register(struct net *net) { return 0; } static inline void sctp_sysctl_net_unregister(struct net *net) { return; } #endif /* Size of Supported Address Parameter for 'x' address types. */ #define SCTP_SAT_LEN(x) (sizeof(struct sctp_paramhdr) + (x) * sizeof(__u16)) #if IS_ENABLED(CONFIG_IPV6) void sctp_v6_pf_init(void); void sctp_v6_pf_exit(void); int sctp_v6_protosw_init(void); void sctp_v6_protosw_exit(void); int sctp_v6_add_protocol(void); void sctp_v6_del_protocol(void); #else /* #ifdef defined(CONFIG_IPV6) */ static inline void sctp_v6_pf_init(void) { return; } static inline void sctp_v6_pf_exit(void) { return; } static inline int sctp_v6_protosw_init(void) { return 0; } static inline void sctp_v6_protosw_exit(void) { return; } static inline int sctp_v6_add_protocol(void) { return 0; } static inline void sctp_v6_del_protocol(void) { return; } #endif /* #if defined(CONFIG_IPV6) */ /* Map an association to an assoc_id. */ static inline sctp_assoc_t sctp_assoc2id(const struct sctp_association *asoc) { return asoc ? asoc->assoc_id : 0; } static inline enum sctp_sstat_state sctp_assoc_to_state(const struct sctp_association *asoc) { /* SCTP's uapi always had SCTP_EMPTY(=0) as a dummy state, but we * got rid of it in kernel space. Therefore SCTP_CLOSED et al * start at =1 in user space, but actually as =0 in kernel space. * Now that we can not break user space and SCTP_EMPTY is exposed * there, we need to fix it up with an ugly offset not to break * applications. :( */ return asoc->state + 1; } /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) /** * sctp_list_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The head item is * returned or %NULL if the list is empty. */ static inline struct list_head *sctp_list_dequeue(struct list_head *list) { struct list_head *result = NULL; if (!list_empty(list)) { result = list->next; list_del_init(result); } return result; } /* SCTP version of skb_set_owner_r. We need this one because * of the way we have to do receive buffer accounting on bundled * chunks. */ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { struct sctp_ulpevent *event = sctp_skb2event(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = sctp_sock_rfree; atomic_add(event->rmem_len, &sk->sk_rmem_alloc); /* * This mimics the behavior of skb_set_owner_r */ sk_mem_charge(sk, event->rmem_len); } /* Tests if the list has one and only one entry. */ static inline int sctp_list_single_entry(struct list_head *head) { return list_is_singular(head); } static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) { return !list_empty(&chunk->list); } /* Walk through a list of TLV parameters. Don't trust the * individual parameter lengths and instead depend on * the chunk length to indicate when to stop. Make sure * there is room for a param header too. */ #define sctp_walk_params(pos, chunk)\ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length)) #define _sctp_walk_params(pos, chunk, end)\ for (pos.v = (u8 *)(chunk + 1);\ (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ pos.v += SCTP_PAD4(ntohs(pos.p->length))) #define sctp_walk_errors(err, chunk_hdr)\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ ((void *)err + offsetof(struct sctp_errhdr, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(struct sctp_errhdr); \ err = (struct sctp_errhdr *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) #define _sctp_walk_fwdtsn(pos, chunk, end)\ for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\ (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\ pos++) /* External references. */ extern struct proto sctp_prot; extern struct proto sctpv6_prot; void sctp_put_port(struct sock *sk); extern struct idr sctp_assocs_id; extern spinlock_t sctp_assocs_id_lock; /* Static inline functions. */ /* Convert from an IP version number to an Address Family symbol. */ static inline int ipver2af(__u8 ipver) { switch (ipver) { case 4: return AF_INET; case 6: return AF_INET6; default: return 0; } } /* Convert from an address parameter type to an address family. */ static inline int param_type2af(__be16 type) { switch (type) { case SCTP_PARAM_IPV4_ADDRESS: return AF_INET; case SCTP_PARAM_IPV6_ADDRESS: return AF_INET6; default: return 0; } } /* Warning: The following hash functions assume a power of two 'size'. */ /* This is the hash function for the SCTP port hash table. */ static inline int sctp_phashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_port_hashsize - 1); } /* This is the hash function for the endpoint hash table. */ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) { return (net_hash_mix(net) + lport) & (sctp_ep_hashsize - 1); } #define sctp_for_each_hentry(ep, head) \ hlist_for_each_entry(ep, head, node) /* Is a socket of this style? */ #define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style)) static inline int __sctp_style(const struct sock *sk, enum sctp_socket_type style) { return sctp_sk(sk)->type == style; } /* Is the association in this state? */ #define sctp_state(asoc, state) __sctp_state((asoc), (SCTP_STATE_##state)) static inline int __sctp_state(const struct sctp_association *asoc, enum sctp_state state) { return asoc->state == state; } /* Is the socket in this state? */ #define sctp_sstate(sk, state) __sctp_sstate((sk), (SCTP_SS_##state)) static inline int __sctp_sstate(const struct sock *sk, enum sctp_sock_state state) { return sk->sk_state == state; } /* Map v4-mapped v6 address back to v4 address */ static inline void sctp_v6_map_v4(union sctp_addr *addr) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = addr->v6.sin6_port; addr->v4.sin_addr.s_addr = addr->v6.sin6_addr.s6_addr32[3]; } /* Map v4 address to v4-mapped v6 address */ static inline void sctp_v4_map_v6(union sctp_addr *addr) { __be16 port; port = addr->v4.sin_port; addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr; addr->v6.sin6_port = port; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_scope_id = 0; addr->v6.sin6_addr.s6_addr32[0] = 0; addr->v6.sin6_addr.s6_addr32[1] = 0; addr->v6.sin6_addr.s6_addr32[2] = htonl(0x0000ffff); } /* The cookie is always 0 since this is how it's used in the * pmtu code. */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { if (t->dst && !dst_check(t->dst, t->dst_cookie)) sctp_transport_dst_release(t); return t->dst; } /* Calculate max payload size given a MTU, or the total overhead if * given MTU is zero */ static inline __u32 __sctp_mtu_payload(const struct sctp_sock *sp, const struct sctp_transport *t, __u32 mtu, __u32 extra) { __u32 overhead = sizeof(struct sctphdr) + extra; if (sp) { overhead += sp->pf->af->net_header_len; if (sp->udp_port && (!t || t->encap_port)) overhead += sizeof(struct udphdr); } else { overhead += sizeof(struct ipv6hdr); } if (WARN_ON_ONCE(mtu && mtu <= overhead)) mtu = overhead; return mtu ? mtu - overhead : overhead; } static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, __u32 mtu, __u32 extra) { return __sctp_mtu_payload(sp, NULL, mtu, extra); } static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) { return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), SCTP_DEFAULT_MINSEGMENT)); } static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) { __u32 pmtu = sctp_dst_mtu(t->dst); if (t->pathmtu == pmtu) return true; t->pathmtu = pmtu; return false; } static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) { return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } static inline int sctp_transport_pl_hlen(struct sctp_transport *t) { return __sctp_mtu_payload(sctp_sk(t->asoc->base.sk), t, 0, 0) - sizeof(struct sctphdr); } static inline void sctp_transport_pl_reset(struct sctp_transport *t) { if (t->probe_interval && (t->param_flags & SPP_PMTUD_ENABLE) && (t->state == SCTP_ACTIVE || t->state == SCTP_UNKNOWN)) { if (t->pl.state == SCTP_PL_DISABLED) { t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } } else { if (t->pl.state != SCTP_PL_DISABLED) { if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } } } static inline void sctp_transport_pl_update(struct sctp_transport *t) { if (t->pl.state == SCTP_PL_DISABLED) return; t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; sctp_transport_reset_probe_timer(t); } static inline bool sctp_transport_pl_enabled(struct sctp_transport *t) { return t->pl.state != SCTP_PL_DISABLED; } static inline bool sctp_newsk_ready(const struct sock *sk) { return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; } static inline void sctp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); sctp_sk(sk)->nodelay = true; release_sock(sk); } #endif /* __net_sctp_h__ */ |
| 6 15 16 110 14 110 110 110 110 373 110 372 2 2 4 2 2 2 2 2 3 1 1 6 6 6 1 4 1 6 5 5 5 5 5 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 | // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include "netdevsim.h" #define NSIM_DEV_HWSTATS_TRAFFIC_MS 100 static struct list_head * nsim_dev_hwstats_get_list_head(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return &hwstats->l3_list; } WARN_ON_ONCE(1); return NULL; } static void nsim_dev_hwstats_traffic_bump(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->enabled) { hwsdev->stats.rx_packets += 1; hwsdev->stats.tx_packets += 2; hwsdev->stats.rx_bytes += 100; hwsdev->stats.tx_bytes += 300; } } } static void nsim_dev_hwstats_traffic_work(struct work_struct *work) { struct nsim_dev_hwstats *hwstats; hwstats = container_of(work, struct nsim_dev_hwstats, traffic_dw.work); mutex_lock(&hwstats->hwsdev_list_lock); nsim_dev_hwstats_traffic_bump(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); } static struct nsim_dev_hwstats_netdev * nsim_dev_hwslist_find_hwsdev(struct list_head *hwsdev_list, int ifindex) { struct nsim_dev_hwstats_netdev *hwsdev; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->netdev->ifindex == ifindex) return hwsdev; } return NULL; } static int nsim_dev_hwsdev_enable(struct nsim_dev_hwstats_netdev *hwsdev, struct netlink_ext_ack *extack) { if (hwsdev->fail_enable) { hwsdev->fail_enable = false; NL_SET_ERR_MSG_MOD(extack, "Stats enablement set to fail"); return -ECANCELED; } hwsdev->enabled = true; return 0; } static void nsim_dev_hwsdev_disable(struct nsim_dev_hwstats_netdev *hwsdev) { hwsdev->enabled = false; memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); } static int nsim_dev_hwsdev_report_delta(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { netdev_offload_xstats_report_delta(info->report_delta, &hwsdev->stats); memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); return 0; } static void nsim_dev_hwsdev_report_used(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { if (hwsdev->enabled) netdev_offload_xstats_report_used(info->report_used); } static int nsim_dev_hwstats_event_off_xstats(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_offload_xstats_info *info; struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; int err = 0; info = ptr; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, info->type); if (!hwsdev_list) return 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) goto out; switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: err = nsim_dev_hwsdev_enable(hwsdev, info->info.extack); break; case NETDEV_OFFLOAD_XSTATS_DISABLE: nsim_dev_hwsdev_disable(hwsdev); break; case NETDEV_OFFLOAD_XSTATS_REPORT_USED: nsim_dev_hwsdev_report_used(hwsdev, info); break; case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: err = nsim_dev_hwsdev_report_delta(hwsdev, info); break; } out: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_fini(struct nsim_dev_hwstats_netdev *hwsdev) { dev_put(hwsdev->netdev); kfree(hwsdev); } static void __nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) return; list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } static void nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev) { mutex_lock(&hwstats->hwsdev_list_lock); __nsim_dev_hwstats_event_unregister(hwstats, dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); } static int nsim_dev_hwstats_event(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: return nsim_dev_hwstats_event_off_xstats(hwstats, dev, event, ptr); case NETDEV_UNREGISTER: nsim_dev_hwstats_event_unregister(hwstats, dev); break; } return 0; } static int nsim_dev_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nsim_dev_hwstats *hwstats; int err = 0; hwstats = container_of(nb, struct nsim_dev_hwstats, netdevice_nb); err = nsim_dev_hwstats_event(hwstats, dev, event, ptr); if (err) return notifier_from_errno(err); return NOTIFY_OK; } static int nsim_dev_hwstats_enable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; struct nsim_dev *nsim_dev; struct net_device *netdev; struct net *net; int err = 0; nsim_dev = container_of(hwstats, struct nsim_dev, hwstats); net = nsim_dev_net(nsim_dev); rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) goto out_unlock_list; netdev = dev_get_by_index(net, ifindex); if (!netdev) { err = -ENODEV; goto out_unlock_list; } hwsdev = kzalloc(sizeof(*hwsdev), GFP_KERNEL); if (!hwsdev) { err = -ENOMEM; goto out_put_netdev; } hwsdev->netdev = netdev; list_add_tail(&hwsdev->list, hwsdev_list); mutex_unlock(&hwstats->hwsdev_list_lock); if (netdev_offload_xstats_enabled(netdev, type)) { nsim_dev_hwsdev_enable(hwsdev, NULL); rtnl_offload_xstats_notify(netdev); } rtnl_unlock(); return err; out_put_netdev: dev_put(netdev); out_unlock_list: mutex_unlock(&hwstats->hwsdev_list_lock); rtnl_unlock(); return err; } static int nsim_dev_hwstats_disable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) list_del(&hwsdev->list); mutex_unlock(&hwstats->hwsdev_list_lock); if (!hwsdev) { err = -ENOENT; goto unlock_out; } if (netdev_offload_xstats_enabled(hwsdev->netdev, type)) { netdev_offload_xstats_push_delta(hwsdev->netdev, type, &hwsdev->stats); rtnl_offload_xstats_notify(hwsdev->netdev); } nsim_dev_hwsdev_fini(hwsdev); unlock_out: rtnl_unlock(); return err; } static int nsim_dev_hwstats_fail_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (!hwsdev) { err = -ENOENT; goto err_hwsdev_list_unlock; } hwsdev->fail_enable = true; err_hwsdev_list_unlock: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } enum nsim_dev_hwstats_do { NSIM_DEV_HWSTATS_DO_DISABLE, NSIM_DEV_HWSTATS_DO_ENABLE, NSIM_DEV_HWSTATS_DO_FAIL, }; struct nsim_dev_hwstats_fops { enum nsim_dev_hwstats_do action; enum netdev_offload_xstats_type type; }; static ssize_t nsim_dev_hwstats_do_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev_hwstats *hwstats = file->private_data; const struct nsim_dev_hwstats_fops *hwsfops; struct list_head *hwsdev_list; int ifindex; int err; hwsfops = debugfs_get_aux(file); err = kstrtoint_from_user(data, count, 0, &ifindex); if (err) return err; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, hwsfops->type); if (WARN_ON(!hwsdev_list)) return -EINVAL; switch (hwsfops->action) { case NSIM_DEV_HWSTATS_DO_DISABLE: err = nsim_dev_hwstats_disable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_ENABLE: err = nsim_dev_hwstats_enable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_FAIL: err = nsim_dev_hwstats_fail_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; } if (err) return err; return count; } static struct debugfs_short_fops debugfs_ops = { .write = nsim_dev_hwstats_do_write, .llseek = generic_file_llseek, }; #define NSIM_DEV_HWSTATS_FOPS(ACTION, TYPE) \ { \ .action = ACTION, \ .type = TYPE, \ } static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_disable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_DISABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_enable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_ENABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_fail_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_FAIL, NETDEV_OFFLOAD_XSTATS_TYPE_L3); #undef NSIM_DEV_HWSTATS_FOPS int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); int err; mutex_init(&hwstats->hwsdev_list_lock); INIT_LIST_HEAD(&hwstats->l3_list); hwstats->netdevice_nb.notifier_call = nsim_dev_netdevice_event; err = register_netdevice_notifier_net(net, &hwstats->netdevice_nb); if (err) goto err_mutex_destroy; hwstats->ddir = debugfs_create_dir("hwstats", nsim_dev->ddir); if (IS_ERR(hwstats->ddir)) { err = PTR_ERR(hwstats->ddir); goto err_unregister_notifier; } hwstats->l3_ddir = debugfs_create_dir("l3", hwstats->ddir); if (IS_ERR(hwstats->l3_ddir)) { err = PTR_ERR(hwstats->l3_ddir); goto err_remove_hwstats_recursive; } debugfs_create_file_aux("enable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_enable_fops, &debugfs_ops); debugfs_create_file_aux("disable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_disable_fops, &debugfs_ops); debugfs_create_file_aux("fail_next_enable", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_fail_fops, &debugfs_ops); INIT_DELAYED_WORK(&hwstats->traffic_dw, &nsim_dev_hwstats_traffic_work); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); return 0; err_remove_hwstats_recursive: debugfs_remove_recursive(hwstats->ddir); err_unregister_notifier: unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); err_mutex_destroy: mutex_destroy(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_list_wipe(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev, *tmp; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; mutex_lock(&hwstats->hwsdev_list_lock); list_for_each_entry_safe(hwsdev, tmp, hwsdev_list, list) { list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } mutex_unlock(&hwstats->hwsdev_list_lock); } void nsim_dev_hwstats_exit(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); cancel_delayed_work_sync(&hwstats->traffic_dw); debugfs_remove_recursive(hwstats->ddir); unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); nsim_dev_hwsdev_list_wipe(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_destroy(&hwstats->hwsdev_list_lock); } |
| 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 1 2 2 2 2 3 2 2 2 2 2 2 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata */ #include <linux/kthread.h> #include <linux/file.h> #include <linux/net.h> #include <linux/platform_device.h> #include <linux/slab.h> /* Hardening for Spectre-v1 */ #include <linux/nospec.h> #include "usbip_common.h" #include "vhci.h" /* TODO: refine locking ?*/ /* * output example: * hub port sta spd dev sockfd local_busid * hs 0000 004 000 00000000 000003 1-2.3 * ................................................ * ss 0008 004 000 00000000 000004 2-3.4 * ................................................ * * Output includes socket fd instead of socket pointer address to avoid * leaking kernel memory address in: * /sys/devices/platform/vhci_hcd.0/status and in debug output. * The socket pointer address is not used at the moment and it was made * visible as a convenient way to find IP address from socket pointer * address by looking up /proc/net/{tcp,tcp6}. As this opens a security * hole, the change is made to use sockfd instead. * */ static void port_show_vhci(char **out, int hub, int port, struct vhci_device *vdev) { if (hub == HUB_SPEED_HIGH) *out += sprintf(*out, "hs %04u %03u ", port, vdev->ud.status); else /* hub == HUB_SPEED_SUPER */ *out += sprintf(*out, "ss %04u %03u ", port, vdev->ud.status); if (vdev->ud.status == VDEV_ST_USED) { *out += sprintf(*out, "%03u %08x ", vdev->speed, vdev->devid); *out += sprintf(*out, "%06u %s", vdev->ud.sockfd, dev_name(&vdev->udev->dev)); } else { *out += sprintf(*out, "000 00000000 "); *out += sprintf(*out, "000000 0-0"); } *out += sprintf(*out, "\n"); } /* Sysfs entry to show port status */ static ssize_t status_show_vhci(int pdev_nr, char *out) { struct platform_device *pdev = vhcis[pdev_nr].pdev; struct vhci *vhci; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; char *s = out; int i; unsigned long flags; if (!pdev || !out) { usbip_dbg_vhci_sysfs("show status error\n"); return 0; } hcd = platform_get_drvdata(pdev); vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; spin_lock_irqsave(&vhci->lock, flags); for (i = 0; i < VHCI_HC_PORTS; i++) { struct vhci_device *vdev = &vhci->vhci_hcd_hs->vdev[i]; spin_lock(&vdev->ud.lock); port_show_vhci(&out, HUB_SPEED_HIGH, pdev_nr * VHCI_PORTS + i, vdev); spin_unlock(&vdev->ud.lock); } for (i = 0; i < VHCI_HC_PORTS; i++) { struct vhci_device *vdev = &vhci->vhci_hcd_ss->vdev[i]; spin_lock(&vdev->ud.lock); port_show_vhci(&out, HUB_SPEED_SUPER, pdev_nr * VHCI_PORTS + VHCI_HC_PORTS + i, vdev); spin_unlock(&vdev->ud.lock); } spin_unlock_irqrestore(&vhci->lock, flags); return out - s; } static ssize_t status_show_not_ready(int pdev_nr, char *out) { char *s = out; int i = 0; for (i = 0; i < VHCI_HC_PORTS; i++) { out += sprintf(out, "hs %04u %03u ", (pdev_nr * VHCI_PORTS) + i, VDEV_ST_NOTASSIGNED); out += sprintf(out, "000 00000000 0000000000000000 0-0"); out += sprintf(out, "\n"); } for (i = 0; i < VHCI_HC_PORTS; i++) { out += sprintf(out, "ss %04u %03u ", (pdev_nr * VHCI_PORTS) + VHCI_HC_PORTS + i, VDEV_ST_NOTASSIGNED); out += sprintf(out, "000 00000000 0000000000000000 0-0"); out += sprintf(out, "\n"); } return out - s; } static int status_name_to_id(const char *name) { char *c; long val; int ret; c = strchr(name, '.'); if (c == NULL) return 0; ret = kstrtol(c+1, 10, &val); if (ret < 0) return ret; return val; } static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *out) { char *s = out; int pdev_nr; out += sprintf(out, "hub port sta spd dev sockfd local_busid\n"); pdev_nr = status_name_to_id(attr->attr.name); if (pdev_nr < 0) out += status_show_not_ready(pdev_nr, out); else out += status_show_vhci(pdev_nr, out); return out - s; } static ssize_t nports_show(struct device *dev, struct device_attribute *attr, char *out) { char *s = out; /* * Half the ports are for SPEED_HIGH and half for SPEED_SUPER, * thus the * 2. */ out += sprintf(out, "%d\n", VHCI_PORTS * vhci_num_controllers); return out - s; } static DEVICE_ATTR_RO(nports); /* Sysfs entry to shutdown a virtual connection */ static int vhci_port_disconnect(struct vhci_hcd *vhci_hcd, __u32 rhport) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; struct vhci *vhci = vhci_hcd->vhci; unsigned long flags; usbip_dbg_vhci_sysfs("enter\n"); mutex_lock(&vdev->ud.sysfs_lock); /* lock */ spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->ud.lock); if (vdev->ud.status == VDEV_ST_NULL) { pr_err("not connected %d\n", vdev->ud.status); /* unlock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); mutex_unlock(&vdev->ud.sysfs_lock); return -EINVAL; } /* unlock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_DOWN); mutex_unlock(&vdev->ud.sysfs_lock); return 0; } static int valid_port(__u32 *pdev_nr, __u32 *rhport) { if (*pdev_nr >= vhci_num_controllers) { pr_err("pdev %u\n", *pdev_nr); return 0; } *pdev_nr = array_index_nospec(*pdev_nr, vhci_num_controllers); if (*rhport >= VHCI_HC_PORTS) { pr_err("rhport %u\n", *rhport); return 0; } *rhport = array_index_nospec(*rhport, VHCI_HC_PORTS); return 1; } static ssize_t detach_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { __u32 port = 0, pdev_nr = 0, rhport = 0; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; int ret; if (kstrtoint(buf, 10, &port) < 0) return -EINVAL; pdev_nr = port_to_pdev_nr(port); rhport = port_to_rhport(port); if (!valid_port(&pdev_nr, &rhport)) return -EINVAL; hcd = platform_get_drvdata(vhcis[pdev_nr].pdev); if (hcd == NULL) { dev_err(dev, "port is not ready %u\n", port); return -EAGAIN; } usbip_dbg_vhci_sysfs("rhport %d\n", rhport); if ((port / VHCI_HC_PORTS) % 2) vhci_hcd = hcd_to_vhci_hcd(hcd)->vhci->vhci_hcd_ss; else vhci_hcd = hcd_to_vhci_hcd(hcd)->vhci->vhci_hcd_hs; ret = vhci_port_disconnect(vhci_hcd, rhport); if (ret < 0) return -EINVAL; usbip_dbg_vhci_sysfs("Leave\n"); return count; } static DEVICE_ATTR_WO(detach); static int valid_args(__u32 *pdev_nr, __u32 *rhport, enum usb_device_speed speed) { if (!valid_port(pdev_nr, rhport)) { return 0; } switch (speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_WIRELESS: case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: break; default: pr_err("Failed attach request for unsupported USB speed: %s\n", usb_speed_string(speed)); return 0; } return 1; } /* Sysfs entry to establish a virtual connection */ /* * To start a new USB/IP attachment, a userland program needs to setup a TCP * connection and then write its socket descriptor with remote device * information into this sysfs file. * * A remote device is virtually attached to the root-hub port of @rhport with * @speed. @devid is embedded into a request to specify the remote device in a * server host. * * write() returns 0 on success, else negative errno. */ static ssize_t attach_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct socket *socket; int sockfd = 0; __u32 port = 0, pdev_nr = 0, rhport = 0, devid = 0, speed = 0; struct usb_hcd *hcd; struct vhci_hcd *vhci_hcd; struct vhci_device *vdev; struct vhci *vhci; int err; unsigned long flags; struct task_struct *tcp_rx = NULL; struct task_struct *tcp_tx = NULL; /* * @rhport: port number of vhci_hcd * @sockfd: socket descriptor of an established TCP connection * @devid: unique device identifier in a remote host * @speed: usb device speed in a remote host */ if (sscanf(buf, "%u %u %u %u", &port, &sockfd, &devid, &speed) != 4) return -EINVAL; pdev_nr = port_to_pdev_nr(port); rhport = port_to_rhport(port); usbip_dbg_vhci_sysfs("port(%u) pdev(%d) rhport(%u)\n", port, pdev_nr, rhport); usbip_dbg_vhci_sysfs("sockfd(%u) devid(%u) speed(%u)\n", sockfd, devid, speed); /* check received parameters */ if (!valid_args(&pdev_nr, &rhport, speed)) return -EINVAL; hcd = platform_get_drvdata(vhcis[pdev_nr].pdev); if (hcd == NULL) { dev_err(dev, "port %d is not ready\n", port); return -EAGAIN; } vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; if (speed >= USB_SPEED_SUPER) vdev = &vhci->vhci_hcd_ss->vdev[rhport]; else vdev = &vhci->vhci_hcd_hs->vdev[rhport]; mutex_lock(&vdev->ud.sysfs_lock); /* Extract socket from fd. */ socket = sockfd_lookup(sockfd, &err); if (!socket) { dev_err(dev, "failed to lookup sock"); err = -EINVAL; goto unlock_mutex; } if (socket->type != SOCK_STREAM) { dev_err(dev, "Expecting SOCK_STREAM - found %d", socket->type); sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } /* create threads before locking */ tcp_rx = kthread_create(vhci_rx_loop, &vdev->ud, "vhci_rx"); if (IS_ERR(tcp_rx)) { sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } tcp_tx = kthread_create(vhci_tx_loop, &vdev->ud, "vhci_tx"); if (IS_ERR(tcp_tx)) { kthread_stop(tcp_rx); sockfd_put(socket); err = -EINVAL; goto unlock_mutex; } /* get task structs now */ get_task_struct(tcp_rx); get_task_struct(tcp_tx); /* now begin lock until setting vdev status set */ spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->ud.lock); if (vdev->ud.status != VDEV_ST_NULL) { /* end of the lock */ spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); sockfd_put(socket); kthread_stop_put(tcp_rx); kthread_stop_put(tcp_tx); dev_err(dev, "port %d already used\n", rhport); /* * Will be retried from userspace * if there's another free port. */ err = -EBUSY; goto unlock_mutex; } dev_info(dev, "pdev(%u) rhport(%u) sockfd(%d)\n", pdev_nr, rhport, sockfd); dev_info(dev, "devid(%u) speed(%u) speed_str(%s)\n", devid, speed, usb_speed_string(speed)); vdev->devid = devid; vdev->speed = speed; vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.tcp_rx = tcp_rx; vdev->ud.tcp_tx = tcp_tx; vdev->ud.status = VDEV_ST_NOTASSIGNED; usbip_kcov_handle_init(&vdev->ud); spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); /* end the lock */ wake_up_process(vdev->ud.tcp_rx); wake_up_process(vdev->ud.tcp_tx); rh_port_connect(vdev, speed); dev_info(dev, "Device attached\n"); mutex_unlock(&vdev->ud.sysfs_lock); return count; unlock_mutex: mutex_unlock(&vdev->ud.sysfs_lock); return err; } static DEVICE_ATTR_WO(attach); #define MAX_STATUS_NAME 16 struct status_attr { struct device_attribute attr; char name[MAX_STATUS_NAME+1]; }; static struct status_attr *status_attrs; static void set_status_attr(int id) { struct status_attr *status; status = status_attrs + id; if (id == 0) strcpy(status->name, "status"); else snprintf(status->name, MAX_STATUS_NAME+1, "status.%d", id); status->attr.attr.name = status->name; status->attr.attr.mode = S_IRUGO; status->attr.show = status_show; sysfs_attr_init(&status->attr.attr); } static int init_status_attrs(void) { int id; status_attrs = kcalloc(vhci_num_controllers, sizeof(struct status_attr), GFP_KERNEL); if (status_attrs == NULL) return -ENOMEM; for (id = 0; id < vhci_num_controllers; id++) set_status_attr(id); return 0; } static void finish_status_attrs(void) { kfree(status_attrs); } struct attribute_group vhci_attr_group = { .attrs = NULL, }; int vhci_init_attr_group(void) { struct attribute **attrs; int ret, i; attrs = kcalloc((vhci_num_controllers + 5), sizeof(struct attribute *), GFP_KERNEL); if (attrs == NULL) return -ENOMEM; ret = init_status_attrs(); if (ret) { kfree(attrs); return ret; } *attrs = &dev_attr_nports.attr; *(attrs + 1) = &dev_attr_detach.attr; *(attrs + 2) = &dev_attr_attach.attr; *(attrs + 3) = &dev_attr_usbip_debug.attr; for (i = 0; i < vhci_num_controllers; i++) *(attrs + i + 4) = &((status_attrs + i)->attr.attr); vhci_attr_group.attrs = attrs; return 0; } void vhci_finish_attr_group(void) { finish_status_attrs(); kfree(vhci_attr_group.attrs); } |
| 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #define NET_CLS_ALIAS_PREFIX "net-cls-" #define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); static inline bool tcf_block_bypass_sw(struct tcf_block *block) { return block && !atomic_read(&block->useswcnt); } #endif static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: if (!skb_transport_header_was_set(skb)) break; return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) { if (tp->usesw) return; if (tc_skip_sw(flags) && tc_in_hw(flags)) return; tp->usesw = true; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif |
| 3 3 3 2066 2063 3 3 16 20 10 7 26 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _ASM_X86_APIC_H #define _ASM_X86_APIC_H #include <linux/cpumask.h> #include <linux/static_call.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/apicdef.h> #include <linux/atomic.h> #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> #include <asm/io.h> #include <asm/posted_intr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 /* Macros for apic_extnmi which controls external NMI masking */ #define APIC_EXTNMI_BSP 0 /* Default */ #define APIC_EXTNMI_ALL 1 #define APIC_EXTNMI_NONE 2 /* * Debugging macros */ #define APIC_QUIET 0 #define APIC_VERBOSE 1 #define APIC_DEBUG 2 /* * Define the default level of output to be very little This can be turned * up by using apic=verbose for more information and apic=debug for _lots_ * of information. apic_verbosity is defined in apic.c */ #define apic_printk(v, s, a...) \ do { \ if ((v) <= apic_verbosity) \ printk(s, ##a); \ } while (0) #define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a) #define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a) #define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a) /* Unconditional debug prints for code which is guarded by apic_verbosity already */ #define apic_dbg(s, a...) printk(KERN_DEBUG s, ##a) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) extern void x86_32_probe_apic(void); #else static inline void x86_32_probe_apic(void) { } #endif extern u32 cpuid_to_apicid[]; #define CPU_ACPIID_INVALID U32_MAX #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, APIC_VIRTUAL_WIRE, APIC_VIRTUAL_WIRE_NO_CONFIG, APIC_SYMMETRIC_IO, APIC_SYMMETRIC_IO_NO_ROUTING }; /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with * such an apic chip so we assume that SMP configuration * is found from MP table (64bit case uses ACPI mostly * which set smp presence flag as well so we are safe * to use this helper too). */ static inline bool apic_from_smp_config(void) { return smp_found_config && !apic_is_disabled; } /* * Basic functions accessing APICs. */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT("=r" (v), "=m" (*addr)), ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) { native_apic_mem_write(APIC_EOI, APIC_EOI_ACK); } extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); static inline bool apic_is_x2apic_enabled(void) { u64 msr; if (rdmsrq_safe(MSR_IA32_APICBASE, &msr)) return false; return msr & X2APIC_ENABLE; } extern void enable_IR_x2apic(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void apic_soft_disable(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void apic_intr_mode_select(void); extern void apic_intr_mode_init(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 static inline bool apic_force_enable(unsigned long addr) { return false; } #else extern bool apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); /* * On 32bit this is mach-xxx local */ #ifdef CONFIG_X86_64 extern int apic_is_clustered_box(void); #else static inline int apic_is_clustered_box(void) { return 0; } #endif extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); extern void topology_register_boot_apic(u32 apic_id); extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); extern void topology_hotunplug_apic(unsigned int cpu); extern void topology_apply_cmdline_limits_early(void); extern void topology_init_possible_cpus(void); extern void topology_reset_possible_cpus_up(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_select(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } static inline void topology_apply_cmdline_limits_early(void) { } static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || reg == APIC_LVR) return; wrmsrq(APIC_BASE_MSR + (reg >> 4), v); } static inline void native_apic_msr_eoi(void) { native_wrmsrq(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK); } static inline u32 native_apic_msr_read(u32 reg) { u64 msr; if (reg == APIC_DFR) return -1; rdmsrq(APIC_BASE_MSR + (reg >> 4), msr); return (u32)msr; } static inline void native_x2apic_icr_write(u32 low, u32 id) { wrmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } static inline u64 native_x2apic_icr_read(void) { unsigned long val; rdmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), val); return val; } extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } static inline u32 native_apic_msr_read(u32 reg) { BUG(); } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ extern void __init check_x2apic(void); struct irq_data; /* * Copyright 2004 James Cleverdon, IBM. * * Generic APIC sub-arch data struct. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ struct apic { /* Hotpath functions first */ void (*eoi)(void); void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); /* IPI related functions */ void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); u32 disable_esr : 1, dest_mode_logical : 1, x2apic_set_max_apicid : 1, nmi_to_offline_cpu : 1; u32 (*calc_dest_apicid)(unsigned int cpu); /* ICR related functions */ u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); /* The limit of the APIC ID space. */ u32 max_apic_id; /* Probe, setup and smpboot functions */ int (*probe)(void); void (*setup)(void); void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); u32 (*cpu_present_to_apicid)(int mps_cpu); u32 (*get_apic_id)(u32 id); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); char *name; }; struct apic_override { void (*eoi)(void); void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); }; /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an * early probing process which one it picks - and then sticks to it): */ extern struct apic *apic; /* * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. */ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC extern struct apic_override __x86_apic_override; void __init apic_setup_apic_calls(void); void __init apic_install_driver(struct apic *driver); #define apic_update_callback(_callback, _fn) { \ __x86_apic_override._callback = _fn; \ apic->_callback = _fn; \ static_call_update(apic_call_##_callback, _fn); \ pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \ } #define DECLARE_APIC_CALL(__cb) \ DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb) DECLARE_APIC_CALL(eoi); DECLARE_APIC_CALL(native_eoi); DECLARE_APIC_CALL(icr_read); DECLARE_APIC_CALL(icr_write); DECLARE_APIC_CALL(read); DECLARE_APIC_CALL(send_IPI); DECLARE_APIC_CALL(send_IPI_mask); DECLARE_APIC_CALL(send_IPI_mask_allbutself); DECLARE_APIC_CALL(send_IPI_allbutself); DECLARE_APIC_CALL(send_IPI_all); DECLARE_APIC_CALL(send_IPI_self); DECLARE_APIC_CALL(wait_icr_idle); DECLARE_APIC_CALL(wakeup_secondary_cpu); DECLARE_APIC_CALL(wakeup_secondary_cpu_64); DECLARE_APIC_CALL(write); static __always_inline u32 apic_read(u32 reg) { return static_call(apic_call_read)(reg); } static __always_inline void apic_write(u32 reg, u32 val) { static_call(apic_call_write)(reg, val); } static __always_inline void apic_eoi(void) { static_call(apic_call_eoi)(); } static __always_inline void apic_native_eoi(void) { static_call(apic_call_native_eoi)(); } static __always_inline u64 apic_icr_read(void) { return static_call(apic_call_icr_read)(); } static __always_inline void apic_icr_write(u32 low, u32 high) { static_call(apic_call_icr_write)(low, high); } static __always_inline void __apic_send_IPI(int cpu, int vector) { static_call(apic_call_send_IPI)(cpu, vector); } static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector) { static_call_mod(apic_call_send_IPI_mask)(mask, vector); } static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { static_call(apic_call_send_IPI_mask_allbutself)(mask, vector); } static __always_inline void __apic_send_IPI_allbutself(int vector) { static_call(apic_call_send_IPI_allbutself)(vector); } static __always_inline void __apic_send_IPI_all(int vector) { static_call(apic_call_send_IPI_all)(vector); } static __always_inline void __apic_send_IPI_self(int vector) { static_call_mod(apic_call_send_IPI_self)(vector); } static __always_inline void apic_wait_icr_idle(void) { static_call_cond(apic_call_wait_icr_idle)(); } static __always_inline u32 safe_apic_wait_icr_idle(void) { return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0; } static __always_inline bool apic_id_valid(u32 apic_id) { return apic_id <= apic->max_apic_id; } static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { if (apic->update_vector) apic->update_vector(cpu, vector, set); } #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } static inline void apic_write(u32 reg, u32 val) { } static inline void apic_eoi(void) { } static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); #define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) #define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector)); return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector))); } static inline bool is_vector_pending(unsigned int vector) { return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 /* * Vector states are maintained by APIC in 32-bit registers that are * 16 bytes aligned. The status of each vector is kept in a single * bit. */ static inline int apic_find_highest_vector(void *bitmap) { int vec; u32 *reg; for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec); if (*reg) return __fls(*reg) + vec; } return -1; } static inline u32 apic_get_reg(void *regs, int reg) { return *((u32 *) (regs + reg)); } static inline void apic_set_reg(void *regs, int reg, u32 val) { *((u32 *) (regs + reg)) = val; } static __always_inline u64 apic_get_reg64(void *regs, int reg) { BUILD_BUG_ON(reg != APIC_ICR); return *((u64 *) (regs + reg)); } static __always_inline void apic_set_reg64(void *regs, int reg, u64 val) { BUILD_BUG_ON(reg != APIC_ICR); *((u64 *) (regs + reg)) = val; } static inline void apic_clear_vector(int vec, void *bitmap) { clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } static inline void apic_set_vector(int vec, void *bitmap) { set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); } /* * Warm reset vector position: */ #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> extern struct apic apic_noop; static inline u32 read_apic_id(void) { u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } #ifdef CONFIG_X86_64 typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); extern int default_acpi_madt_oem_check(char *, char *); extern void x86_64_probe_apic(void); #else static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } static inline void x86_64_probe_apic(void) { } #endif extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); extern u32 default_cpu_present_to_apicid(int mps_cpu); void apic_send_nmi_to_offline_cpu(unsigned int cpu); #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 read_apic_id(void) { return 0; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP void apic_smt_update(void); #else static inline void apic_smt_update(void) { } #endif struct msi_msg; struct irq_cfg; extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar); extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ |
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_IODEV_H__ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> #include <linux/errno.h> struct kvm_io_device; struct kvm_vcpu; /** * kvm_io_device_ops are called under kvm slots_lock. * read and write handlers return 0 if the transaction has been handled, * or non-zero to have it passed to the next device. **/ struct kvm_io_device_ops { int (*read)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *val); int (*write)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val); void (*destructor)(struct kvm_io_device *this); }; struct kvm_io_device { const struct kvm_io_device_ops *ops; }; static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { dev->ops = ops; } static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, void *v) { return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, const void *v) { return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } #endif /* __KVM_IODEV_H__ */ |
| 2 2 1 2 2 2 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 | // SPDX-License-Identifier: GPL-2.0-only /* * MCE grading rules. * Copyright 2008, 2009 Intel Corporation. * * Author: Andi Kleen */ #include <linux/kernel.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <asm/mce.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include "internal.h" /* * Grade an mce by severity. In general the most severe ones are processed * first. Since there are quite a lot of combinations test the bits in a * table-driven way. The rules are simply processed in order, first * match wins. * * Note this is only used for machine check exceptions, the corrected * errors use much simpler rules. The exceptions still check for the corrected * errors, but only to leave them alone for the CMCI handler (except for * panic situations) */ enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; u64 result; unsigned char sev; unsigned short mcgmask; unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; unsigned char covered; unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h #define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT #define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) MCESEV( NO, "Invalid", BITCLR(MCI_STATUS_VAL) ), MCESEV( NO, "Not enabled", EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", BITSET(MCI_STATUS_PCC) ), /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set in MCA handler", EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), /* * known AO MCACODs reported via MCE or CMC: * * SRAO could be signaled either via a machine check exception or * CMCI with the corresponding bit S 1 or 0. So we don't need to * check bit S for SRAO. */ MCESEV( AO, "Action optional: memory scrubbing error", SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), /* * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured * to report uncorrected errors using CMCI with a special signature. * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported * in one of the memory controller banks. * Set severity to "AO" for same action as normal patrol scrub error. */ MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( PANIC, "Illegal combination (UCNA with AR=1)", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), MCESEV( PANIC, "Action required with lost events", SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) ), /* known AR MCACODs: */ #ifdef CONFIG_MEMORY_FAILURE MCESEV( KEEP, "Action required but unaffected thread is continuable", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( AR, "Action required: data load in error recoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL_RECOV ), MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), MCESEV( AR, "Action required: instruction fetch error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), MCESEV( AR, "Data load error in SEAM non-root mode", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), KERNEL ), MCESEV( AR, "Instruction fetch error in SEAM non-root mode", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), KERNEL ), MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL ), MCESEV( PANIC, "Instruction fetch error in kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), KERNEL ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) ), MCESEV( SOME, "Action optional with lost events", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) ), MCESEV( PANIC, "Overflowed uncorrected", BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) ), MCESEV( PANIC, "Uncorrected in kernel", BITSET(MCI_STATUS_UC), KERNEL ), MCESEV( UC, "Uncorrected", BITSET(MCI_STATUS_UC) ), MCESEV( SOME, "No match", BITSET(0) ) /* always matches. keep at end */ }; #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) static bool is_copy_from_user(struct pt_regs *regs) { u8 insn_buf[MAX_INSN_SIZE]; unsigned long addr; struct insn insn; int ret; if (!regs) return false; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return false; switch (insn.opcode.value) { /* MOV mem,reg */ case 0x8A: case 0x8B: /* MOVZ mem,reg */ case 0xB60F: case 0xB70F: addr = (unsigned long)insn_get_addr_ref(&insn, regs); break; /* REP MOVS */ case 0xA4: case 0xA5: addr = regs->si; break; default: return false; } if (fault_in_kernel_space(addr)) return false; current->mce_vaddr = (void __user *)addr; return true; } /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have * to assume the worst case (IN_KERNEL) as we actually * have no idea what we were executing when the machine * check hit. * If we do have a good "m->cs" (or a faked one in the * case we were executing in VM86 mode) we can use it to * distinguish an exception taken in user from from one * taken in the kernel. */ static noinstr int error_context(struct mce *m, struct pt_regs *regs) { int fixup_type; bool copy_user; if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; /* Allow instrumentation around external facilities usage. */ instrumentation_begin(); fixup_type = ex_get_fixup_type(m->ip); copy_user = is_copy_from_user(regs); instrumentation_end(); if (copy_user) { m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } switch (fixup_type) { case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; default: return IN_KERNEL; } } /* See AMD PPR(s) section Machine Check Error Handling. */ static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { char *panic_msg = NULL; int ret; /* * Default return value: Action required, the error must be handled * immediately. */ ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) { panic_msg = "Processor Context Corrupt"; ret = MCE_PANIC_SEVERITY; goto out; } if (m->status & MCI_STATUS_DEFERRED) { ret = MCE_DEFERRED_SEVERITY; goto out; } /* * If the UC bit is not set, the system either corrected or deferred * the error. No action will be required after logging the error. */ if (!(m->status & MCI_STATUS_UC)) { ret = MCE_KEEP_SEVERITY; goto out; } /* * On MCA overflow, without the MCA overflow recovery feature the * system will not be able to recover, panic. */ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; ret = MCE_PANIC_SEVERITY; goto out; } if (!mce_flags.succor) { panic_msg = "Uncorrected error without MCA Recovery"; ret = MCE_PANIC_SEVERITY; goto out; } if (error_context(m, regs) == IN_KERNEL) { panic_msg = "Uncorrected unrecoverable error in kernel context"; ret = MCE_PANIC_SEVERITY; } out: if (msg && panic_msg) *msg = panic_msg; return ret; } static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { if ((m->status & s->mask) != s->result) continue; if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; if (s->ser == SER_REQUIRED && !mca_cfg.ser) continue; if (s->ser == NO_SER && mca_cfg.ser) continue; if (s->context && ctx != s->context) continue; if (s->excp && excp != s->excp) continue; if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) continue; if (msg) *msg = s->msg; s->covered = 1; return s->sev; } } int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) return mce_severity_amd(m, regs, msg, is_excp); else return mce_severity_intel(m, regs, msg, is_excp); } #ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { if (*pos >= ARRAY_SIZE(severities)) return NULL; return &severities[*pos]; } static void *s_next(struct seq_file *f, void *data, loff_t *pos) { if (++(*pos) >= ARRAY_SIZE(severities)) return NULL; return &severities[*pos]; } static void s_stop(struct seq_file *f, void *data) { } static int s_show(struct seq_file *f, void *data) { struct severity *ser = data; seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); return 0; } static const struct seq_operations severities_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; static int severities_coverage_open(struct inode *inode, struct file *file) { return seq_open(file, &severities_seq_ops); } static ssize_t severities_coverage_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { int i; for (i = 0; i < ARRAY_SIZE(severities); i++) severities[i].covered = 0; return count; } static const struct file_operations severities_coverage_fops = { .open = severities_coverage_open, .release = seq_release, .read = seq_read, .write = severities_coverage_write, .llseek = seq_lseek, }; static int __init severities_debugfs_init(void) { struct dentry *dmce; dmce = mce_get_debugfs_dir(); debugfs_create_file("severities-coverage", 0444, dmce, NULL, &severities_coverage_fops); return 0; } late_initcall(severities_debugfs_init); #endif /* CONFIG_DEBUG_FS */ |
| 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Unique identification number generator * * Copyright © 2024-2025 Microsoft Corporation */ #include <kunit/test.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/random.h> #include <linux/spinlock.h> #include "common.h" #include "id.h" #define COUNTER_PRE_INIT 0 static atomic64_t next_id = ATOMIC64_INIT(COUNTER_PRE_INIT); static void __init init_id(atomic64_t *const counter, const u32 random_32bits) { u64 init; /* * Ensures sure 64-bit values are always used by user space (or may * fail with -EOVERFLOW), and makes this testable. */ init = BIT_ULL(32); /* * Makes a large (2^32) boot-time value to limit ID collision in logs * from different boots, and to limit info leak about the number of * initially (relative to the reader) created elements (e.g. domains). */ init += random_32bits; /* Sets first or ignores. This will be the first ID. */ atomic64_cmpxchg(counter, COUNTER_PRE_INIT, init); } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void __init test_init_min(struct kunit *const test) { atomic64_t counter = ATOMIC64_INIT(COUNTER_PRE_INIT); init_id(&counter, 0); KUNIT_EXPECT_EQ(test, atomic64_read(&counter), 1ULL + U32_MAX); } static void __init test_init_max(struct kunit *const test) { atomic64_t counter = ATOMIC64_INIT(COUNTER_PRE_INIT); init_id(&counter, ~0); KUNIT_EXPECT_EQ(test, atomic64_read(&counter), 1 + (2ULL * U32_MAX)); } static void __init test_init_once(struct kunit *const test) { const u64 first_init = 1ULL + U32_MAX; atomic64_t counter = ATOMIC64_INIT(COUNTER_PRE_INIT); init_id(&counter, 0); KUNIT_EXPECT_EQ(test, atomic64_read(&counter), first_init); init_id(&counter, ~0); KUNIT_EXPECT_EQ_MSG( test, atomic64_read(&counter), first_init, "Should still have the same value after the subsequent init_id()"); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ void __init landlock_init_id(void) { return init_id(&next_id, get_random_u32()); } /* * It's not worth it to try to hide the monotonic counter because it can still * be inferred (with N counter ranges), and if we are allowed to read the inode * number we should also be allowed to read the time creation anyway, and it * can be handy to store and sort domain IDs for user space. * * Returns the value of next_id and increment it to let some space for the next * one. */ static u64 get_id_range(size_t number_of_ids, atomic64_t *const counter, u8 random_4bits) { u64 id, step; /* * We should return at least 1 ID, and we may need a set of consecutive * ones (e.g. to generate a set of inodes). */ if (WARN_ON_ONCE(number_of_ids <= 0)) number_of_ids = 1; /* * Blurs the next ID guess with 1/16 ratio. We get 2^(64 - 4) - * (2 * 2^32), so a bit less than 2^60 available IDs, which should be * much more than enough considering the number of CPU cycles required * to get a new ID (e.g. a full landlock_restrict_self() call), and the * cost of draining all available IDs during the system's uptime. */ random_4bits &= 0b1111; step = number_of_ids + random_4bits; /* It is safe to cast a signed atomic to an unsigned value. */ id = atomic64_fetch_add(step, counter); /* Warns if landlock_init_id() was not called. */ WARN_ON_ONCE(id == COUNTER_PRE_INIT); return id; } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static u8 get_random_u8_positive(void) { /* max() evaluates its arguments once. */ return max(1, get_random_u8()); } static void test_range1_rand0(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 0), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 1); } static void test_range1_rand1(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 1), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 2); } static void test_range1_rand15(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 15), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 16); } static void test_range1_rand16(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 16), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 1); } static void test_range2_rand0(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 0), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 2); } static void test_range2_rand1(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 1), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 3); } static void test_range2_rand2(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 2), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 4); } static void test_range2_rand15(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 15), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 17); } static void test_range2_rand16(struct kunit *const test) { atomic64_t counter; u64 init; init = get_random_u32(); atomic64_set(&counter, init); KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 16), init); KUNIT_EXPECT_EQ(test, get_id_range(get_random_u8_positive(), &counter, get_random_u8()), init + 2); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ /** * landlock_get_id_range - Get a range of unique IDs * * @number_of_ids: Number of IDs to hold. Must be greater than one. * * Returns: The first ID in the range. */ u64 landlock_get_id_range(size_t number_of_ids) { return get_id_range(number_of_ids, &next_id, get_random_u8()); } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static struct kunit_case __refdata test_cases[] = { /* clang-format off */ KUNIT_CASE(test_init_min), KUNIT_CASE(test_init_max), KUNIT_CASE(test_init_once), KUNIT_CASE(test_range1_rand0), KUNIT_CASE(test_range1_rand1), KUNIT_CASE(test_range1_rand15), KUNIT_CASE(test_range1_rand16), KUNIT_CASE(test_range2_rand0), KUNIT_CASE(test_range2_rand1), KUNIT_CASE(test_range2_rand2), KUNIT_CASE(test_range2_rand15), KUNIT_CASE(test_range2_rand16), {} /* clang-format on */ }; static struct kunit_suite test_suite = { .name = "landlock_id", .test_cases = test_cases, }; kunit_test_init_section_suite(test_suite); #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ |
| 6 4 4 1 1 1 30 31 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 | // SPDX-License-Identifier: GPL-2.0-only /* * KVM dirty ring implementation * * Copyright 2019 Red Hat, Inc. */ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/vmalloc.h> #include <linux/kvm_dirty_ring.h> #include <trace/events/kvm.h> #include "kvm_mm.h" int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) { return 0; } u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); } bool kvm_use_dirty_bitmap(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap; } #ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { return false; } #endif static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) { return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); } static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) { return kvm_dirty_ring_used(ring) >= ring->soft_limit; } static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) { return kvm_dirty_ring_used(ring) >= ring->size; } static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) { struct kvm_memory_slot *memslot; int as_id, id; as_id = slot >> 16; id = (u16)slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return; memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); if (!memslot || (offset + __fls(mask)) >= memslot->npages) return; KVM_MMU_LOCK(kvm); kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); KVM_MMU_UNLOCK(kvm); } int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; return 0; } static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) { smp_store_release(&gfn->flags, 0); } static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) { gfn->flags = KVM_DIRTY_GFN_F_DIRTY; } static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET; } int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring, int *nr_entries_reset) { /* * To minimize mmu_lock contention, batch resets for harvested entries * whose gfns are in the same slot, and are within N frame numbers of * each other, where N is the number of bits in an unsigned long. For * simplicity, process the current set of entries when the next entry * can't be included in the batch. * * Track the current batch slot, the gfn offset into the slot for the * batch, and the bitmask of gfns that need to be reset (relative to * offset). Note, the offset may be adjusted backwards, e.g. so that * a sequence of gfns X, X-1, ... X-N-1 can be batched. */ u32 cur_slot, next_slot; u64 cur_offset, next_offset; unsigned long mask = 0; struct kvm_dirty_gfn *entry; /* * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized, * e.g. so that KVM fully resets all entries processed by a given call * before returning to userspace. Holding slots_lock also protects * the various memslot accesses. */ lockdep_assert_held(&kvm->slots_lock); while (likely((*nr_entries_reset) < INT_MAX)) { if (signal_pending(current)) return -EINTR; entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)]; if (!kvm_dirty_gfn_harvested(entry)) break; next_slot = READ_ONCE(entry->slot); next_offset = READ_ONCE(entry->offset); /* Update the flags to reflect that this GFN is reset */ kvm_dirty_gfn_set_invalid(entry); ring->reset_index++; (*nr_entries_reset)++; if (mask) { /* * While the size of each ring is fixed, it's possible * for the ring to be constantly re-dirtied/harvested * while the reset is in-progress (the hard limit exists * only to guard against the count becoming negative). */ cond_resched(); /* * Try to coalesce the reset operations when the guest * is scanning pages in the same slot. */ if (next_slot == cur_slot) { s64 delta = next_offset - cur_offset; if (delta >= 0 && delta < BITS_PER_LONG) { mask |= 1ull << delta; continue; } /* Backwards visit, careful about overflows! */ if (delta > -BITS_PER_LONG && delta < 0 && (mask << -delta >> -delta) == mask) { cur_offset = next_offset; mask = (mask << -delta) | 1; continue; } } /* * Reset the slot for all the harvested entries that * have been gathered, but not yet fully processed. */ kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); } /* * The current slot was reset or this is the first harvested * entry, (re)initialize the metadata. */ cur_slot = next_slot; cur_offset = next_offset; mask = 1; } /* * Perform a final reset if there are harvested entries that haven't * been processed, which is guaranteed if at least one harvested was * found. The loop only performs a reset when the "next" entry can't * be batched with the "current" entry(s), and that reset processes the * _current_ entry(s); i.e. the last harvested entry, a.k.a. next, will * always be left pending. */ if (mask) kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); /* * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared * by the VCPU thread next time when it enters the guest. */ trace_kvm_dirty_ring_reset(ring); return 0; } void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) { struct kvm_dirty_ring *ring = &vcpu->dirty_ring; struct kvm_dirty_gfn *entry; /* It should never get full */ WARN_ON_ONCE(kvm_dirty_ring_full(ring)); entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)]; entry->slot = slot; entry->offset = offset; /* * Make sure the data is filled in before we publish this to * the userspace program. There's no paired kernel-side reader. */ smp_wmb(); kvm_dirty_gfn_set_dirtied(entry); ring->dirty_index++; trace_kvm_dirty_ring_push(ring, slot, offset); if (kvm_dirty_ring_soft_full(ring)) kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); } bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu) { /* * The VCPU isn't runnable when the dirty ring becomes soft full. * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent * the VCPU from running until the dirty pages are harvested and * the dirty ring is reset by userspace. */ if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) && kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) { kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; trace_kvm_dirty_ring_exit(vcpu); return true; } return false; } struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) { return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE); } void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) { vfree(ring->dirty_gfns); ring->dirty_gfns = NULL; } |
| 6 6 6 6 3 3 1 1 1 1 1 3 3 3 3 3 3 2 3 3 3 1 6 6 2 2 2 6 6 3 1 1 1 1 8 13 5 3 1 1 2 1 1 10 10 10 10 13 13 13 17 17 16 6 6 5 8 13 13 17 17 549 549 549 549 549 3 549 6 4 4 4 10 1 1 1 1 1 4 4 4 3 3 3 2 4 3 3 2 1 1 10 550 549 549 549 549 549 1 1 554 600 600 600 599 600 599 547 544 547 547 544 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 virtual tunneling interface * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv6/ip6_tunnel.c */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT); } static int vti6_dev_init(struct net_device *dev); static void vti6_dev_setup(struct net_device *dev); static struct rtnl_link_ops vti6_link_ops __read_mostly; static unsigned int vti6_net_id __read_mostly; struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; #define for_each_vti6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * vti6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct in6_addr any; for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && (t->dev->flags & IFF_UP)) return t; } hash = HASH(remote, &any); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * vti6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * vti6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } static void vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; for (tp = vti6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); vti6_tnl_link(ip6n, t); return 0; out: return err; } static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6_vti%%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = vti6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return NULL; } /** * vti6_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * vti6_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or NULL **/ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); for (tp = vti6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) return NULL; return t; } } if (!create) return NULL; return vti6_tnl_create(net, p); } /** * vti6_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * vti6_dev_uninit() removes tunnel from its list **/ static void vti6_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else vti6_tnl_unlink(ip6n, t); netdev_put(dev, &t->dev_tracker); } static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); rcu_read_lock(); t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); goto discard; } if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); goto discard; } ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { DEV_STATS_INC(t->dev, rx_dropped); rcu_read_unlock(); goto discard; } rcu_read_unlock(); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; discard: kfree_skb(skb); return 0; } static int vti6_rcv(struct sk_buff *skb) { int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; return vti6_input_proto(skb, nexthdr, 0, 0); } static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; if (!t) return 1; dev = t->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } /** * vti6_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } static bool vti6_state_check(const struct xfrm_state *x, const struct in6_addr *dst, const struct in6_addr *src) { xfrm_address_t *daddr = (xfrm_address_t *)dst; xfrm_address_t *saddr = (xfrm_address_t *)src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. */ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET6) return false; if (ipv6_addr_any(dst)) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) return false; return true; } /** * vti6_xmit - send a packet * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @fl: the flow informations for the xfrm_lookup **/ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; int pkt_len = skb->len; int err = -1; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) goto tx_err_link_failure; dst = &rt->dst; skb_dst_set(skb, dst); break; } case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; goto tx_err_link_failure; } skb_dst_set(skb, dst); break; default: goto tx_err_link_failure; } } dst_hold(dst); dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; x = dst->xfrm; if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr, (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } err = -EMSGSIZE; goto tx_err_dst_release; } xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = dst_dev(dst); err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct flowi fl; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(t->parms.o_key); ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip6_tnl *t; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; int protocol = iph->nexthdr; t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); if (!t) return -1; mark = be32_to_cpu(t->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct net_device *tdev = NULL; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { WRITE_ONCE(dev->mtu, clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) mtu = tdev->mtu - sizeof(struct ipv6hdr); else mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr); dev->mtu = max_t(int, mtu, IPV4_MIN_MTU); } /** * vti6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * @keep_mtu: MTU was set from userspace, don't re-compute it * * Description: * vti6_tnl_change() updates the tunnel parameters **/ static int vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, bool keep_mtu) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.link = p->link; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); vti6_link_config(t, keep_mtu); return 0; } static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool keep_mtu) { struct net *net = dev_net(t->dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; vti6_tnl_unlink(ip6n, t); synchronize_net(); err = vti6_tnl_change(t, p, keep_mtu); vti6_tnl_link(ip6n, t); netdev_state_change(t->dev); return err; } static void vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; if (u->i_key) u->i_flags |= GRE_KEY; if (u->o_key) u->o_flags |= GRE_KEY; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6_vti0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = NULL; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); } else { memset(&p, 0, sizeof(p)); } if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) break; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); err = vti6_update(t, &p1, false); } if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); if (!t) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_iflink = ip6_tnl_get_iflink, }; /** * vti6_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void vti6_dev_setup(struct net_device *dev) { dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * vti6_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int vti6_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; } /** * vti6_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int vti6_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = vti6_dev_init_gen(dev); if (err) return err; vti6_link_config(t, true); return 0; } /** * vti6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int vti6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti6_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti6_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct ip6_tnl *nt; struct net *net; net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; nt->net = net; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; return vti6_tnl_create2(dev); } static void vti6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t; struct __ip6_tnl_parm p; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) return -EINVAL; vti6_netlink_parms(data, &p); t = vti6_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); return vti6_update(t, &p, tb && tb[IFLA_MTU]); } static size_t vti6_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) || nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti6_link_ops __read_mostly = { .kind = "vti6", .maxtype = IFLA_VTI_MAX, .policy = vti6_policy, .priv_size = sizeof(struct ip6_tnl), .setup = vti6_dev_setup, .validate = vti6_validate, .newlink = vti6_newlink, .dellink = vti6_dellink, .changelink = vti6_changelink, .get_size = vti6_get_size, .fill_info = vti6_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) { struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops; err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int vti6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0); } static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; #endif /** * vti6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init vti6_tunnel_init(void) { const char *msg; int err; msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) msg = "ipv6 tunnel"; err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6); if (err < 0) goto vti_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET); if (err < 0) goto vti_tunnel_ip6ip_failed; #endif msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); vti_tunnel_ip6ip_failed: err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); vti_tunnel_ipv6_failed: #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: pr_err("vti6 init: failed to register %s\n", msg); return err; } /** * vti6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); } module_init(vti6_tunnel_init); module_exit(vti6_tunnel_cleanup); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti6"); MODULE_ALIAS_NETDEV("ip6_vti0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("IPv6 virtual tunnel interface"); |
| 2859 2865 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 Google LLC. */ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H #include <linux/sched.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/lsm_hooks.h> #ifdef CONFIG_BPF_LSM #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_lsm_##NAME(__VA_ARGS__); #include <linux/lsm_hook_defs.h> #undef LSM_HOOK struct bpf_storage_blob { struct bpf_local_storage __rcu *storage; }; extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range); int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags); int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return false; } static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) { return false; } static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { return NULL; } static inline void bpf_inode_storage_free(struct inode *inode) { } static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { } static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range) { return -EOPNOTSUPP; } static inline int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags) { return -EOPNOTSUPP; } static inline int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str) { return -EOPNOTSUPP; } static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) { return false; } #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IEEE802154_CORE_H #define __IEEE802154_CORE_H #include <net/cfg802154.h> struct cfg802154_registered_device { const struct cfg802154_ops *ops; struct list_head list; /* wpan_phy index, internal only */ int wpan_phy_idx; /* also protected by devlist_mtx */ int opencount; wait_queue_head_t dev_wait; /* protected by RTNL only */ int num_running_ifaces; /* associated wpan interfaces, protected by rtnl or RCU */ struct list_head wpan_dev_list; int devlist_generation, wpan_dev_id; /* must be last because of the way we do wpan_phy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN); }; static inline struct cfg802154_registered_device * wpan_phy_to_rdev(struct wpan_phy *wpan_phy) { BUG_ON(!wpan_phy); return container_of(wpan_phy, struct cfg802154_registered_device, wpan_phy); } extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ |
| 4 4 4 4 4 4 4 4 9 9 9 9 9 7 7 9 9 9 7 7 9 7 3 3 3 3 3 3 3 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "originator.h" #include "send.h" #include "tvlv.h" /** * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for * free after rcu grace period * @ref: kref pointer of the tvlv */ static void batadv_tvlv_handler_release(struct kref *ref) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); kfree_rcu(tvlv_handler, rcu); } /** * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv_handler: the tvlv handler to free */ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { if (!tvlv_handler) return; kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list * based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to look for * @version: tvlv handler version to look for * * Return: tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler * batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler_tmp, &bat_priv->tvlv.handler_list, list) { if (tvlv_handler_tmp->type != type) continue; if (tvlv_handler_tmp->version != version) continue; if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; break; } rcu_read_unlock(); return tvlv_handler; } /** * batadv_tvlv_container_release() - release tvlv from lists and free * @ref: kref pointer of the tvlv */ static void batadv_tvlv_container_release(struct kref *ref) { struct batadv_tvlv_container *tvlv; tvlv = container_of(ref, struct batadv_tvlv_container, refcount); kfree(tvlv); } /** * batadv_tvlv_container_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv: the tvlv container to free */ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { if (!tvlv) return; kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container * list based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type to look for * @version: tvlv container version to look for * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container * batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; if (tvlv_tmp->tvlv_hdr.version != version) continue; kref_get(&tvlv_tmp->refcount); tvlv = tvlv_tmp; break; } return tvlv; } /** * batadv_tvlv_container_list_size() - calculate the size of the tvlv container * list entries * @bat_priv: the bat priv with all the mesh interface information * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); } return tvlv_len; } /** * batadv_tvlv_container_remove() - remove tvlv container from the tvlv * container list * @bat_priv: the bat priv with all the mesh interface information * @tvlv: the to be removed tvlv container * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). */ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, struct batadv_tvlv_container *tvlv) { lockdep_assert_held(&bat_priv->tvlv.container_list_lock); if (!tvlv) return; hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ batadv_tvlv_container_put(tvlv); batadv_tvlv_container_put(tvlv); } /** * batadv_tvlv_container_unregister() - unregister tvlv container based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type to unregister * @version: tvlv container type to unregister */ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } /** * batadv_tvlv_container_register() - register tvlv type, version and content * to be propagated with each (primary interface) OGM * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type * @version: tvlv container version * @tvlv_value: tvlv container content * @tvlv_value_len: tvlv container content length * * If a container of the same type and version was already registered the new * content is going to replace the old one. */ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_container *tvlv_old, *tvlv_new; if (!tvlv_value) tvlv_value_len = 0; tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); if (!tvlv_new) return; tvlv_new->tvlv_hdr.version = version; tvlv_new->tvlv_hdr.type = type; tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv_old); kref_get(&tvlv_new->refcount); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); /* don't return reference to new tvlv_container */ batadv_tvlv_container_put(tvlv_new); } /** * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate * requested packet size * @packet_buff: packet buffer * @packet_buff_len: packet buffer size * @min_packet_len: requested packet minimum size * @additional_packet_len: requested additional packet size on top of minimum * size * * Return: true of the packet buffer could be changed to the requested size, * false otherwise. */ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, int *packet_buff_len, int min_packet_len, int additional_packet_len) { unsigned char *new_buff; new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ if (!new_buff) return false; memcpy(new_buff, *packet_buff, min_packet_len); kfree(*packet_buff); *packet_buff = new_buff; *packet_buff_len = min_packet_len + additional_packet_len; return true; } /** * batadv_tvlv_container_ogm_append() - append tvlv container content to given * OGM packet buffer * @bat_priv: the bat priv with all the mesh interface information * @packet_buff: ogm packet buffer * @packet_buff_len: ogm packet buffer size including ogm header and tvlv * content * @packet_min_len: ogm header size to be preserved for the OGM itself * * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_len; void *tvlv_value; bool ret; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, packet_min_len, tvlv_value_len); if (!ret) goto end; if (!tvlv_value_len) goto end; tvlv_value = (*packet_buff) + packet_min_len; hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_hdr = tvlv_value; tvlv_hdr->type = tvlv->tvlv_hdr.type; tvlv_hdr->version = tvlv->tvlv_hdr.version; tvlv_hdr->len = tvlv->tvlv_hdr.len; tvlv_value = tvlv_hdr + 1; memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); } end: spin_unlock_bh(&bat_priv->tvlv.container_list_lock); return tvlv_value_len; } /** * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the mesh interface information * @tvlv_handler: tvlv callback function handling the tvlv content * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success if the handler was not found or the return value of the * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { unsigned int tvlv_offset; u8 *src, *dst; if (!tvlv_handler) return NET_RX_SUCCESS; switch (packet_type) { case BATADV_IV_OGM: case BATADV_OGM2: if (!tvlv_handler->ogm_handler) return NET_RX_SUCCESS; if (!orig_node) return NET_RX_SUCCESS; tvlv_handler->ogm_handler(bat_priv, orig_node, BATADV_NO_FLAGS, tvlv_value, tvlv_value_len); tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; break; case BATADV_UNICAST_TVLV: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->unicast_handler) return NET_RX_SUCCESS; src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src; dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst; return tvlv_handler->unicast_handler(bat_priv, src, dst, tvlv_value, tvlv_value_len); case BATADV_MCAST: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->mcast_handler) return NET_RX_SUCCESS; tvlv_offset = (unsigned char *)tvlv_value - skb->data; skb_set_network_header(skb, tvlv_offset); skb_set_transport_header(skb, tvlv_offset + tvlv_value_len); return tvlv_handler->mcast_handler(bat_priv, skb); } return NET_RX_SUCCESS; } /** * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the mesh interface information * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_cont_len; u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; int ret = NET_RX_SUCCESS; while (tvlv_value_len >= sizeof(*tvlv_hdr)) { tvlv_hdr = tvlv_value; tvlv_value_cont_len = ntohs(tvlv_hdr->len); tvlv_value = tvlv_hdr + 1; tvlv_value_len -= sizeof(*tvlv_hdr); if (tvlv_value_cont_len > tvlv_value_len) break; tvlv_handler = batadv_tvlv_handler_get(bat_priv, tvlv_hdr->type, tvlv_hdr->version); ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, packet_type, orig_node, skb, tvlv_value, tvlv_value_cont_len); batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } if (packet_type != BATADV_IV_OGM && packet_type != BATADV_OGM2) return ret; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler, &bat_priv->tvlv.handler_list, list) { if (!tvlv_handler->ogm_handler) continue; if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) tvlv_handler->ogm_handler(bat_priv, orig_node, cifnotfound, NULL, 0); tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; } rcu_read_unlock(); return NET_RX_SUCCESS; } /** * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate * handlers * @bat_priv: the bat priv with all the mesh interface information * @batadv_ogm_packet: ogm packet containing the tvlv containers * @orig_node: orig node emitting the ogm packet */ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_orig_node *orig_node) { void *tvlv_value; u16 tvlv_value_len; if (!batadv_ogm_packet) return; tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); if (!tvlv_value_len) return; tvlv_value = batadv_ogm_packet + 1; batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL, tvlv_value, tvlv_value_len); } /** * batadv_tvlv_handler_register() - register tvlv handler based on the provided * type and version (both need to match) for ogm tvlv payload and/or unicast * payload * @bat_priv: the bat priv with all the mesh interface information * @optr: ogm tvlv handler callback function. This function receives the orig * node, flags and the tvlv content as argument to process. * @uptr: unicast tvlv handler callback function. This function receives the * source & destination of the unicast packet as well as the tvlv content * to process. * @mptr: multicast packet tvlv handler callback function. This function * receives the full skb to process, with the skb network header pointing * to the current tvlv and the skb transport header pointing to the first * byte after the current tvlv. * @type: tvlv handler type to be registered * @version: tvlv handler version to be registered * @flags: flags to enable or disable TVLV API behavior */ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), int (*mptr)(struct batadv_priv *bat_priv, struct sk_buff *skb), u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; spin_lock_bh(&bat_priv->tvlv.handler_list_lock); tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); if (!tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; tvlv_handler->mcast_handler = mptr; tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); /* don't return reference to new tvlv_handler */ batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to be unregistered * @version: tvlv handler version to be unregistered */ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (!tvlv_handler) return; batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the * specified host * @bat_priv: the bat priv with all the mesh interface information * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet * @type: tvlv type * @version: tvlv version * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; struct batadv_orig_node *orig_node; struct sk_buff *skb; unsigned char *tvlv_buff; unsigned int tvlv_len; ssize_t hdr_len = sizeof(*unicast_tvlv_packet); orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) return; tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); if (!skb) goto out; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; unicast_tvlv_packet->ttl = BATADV_TTL; unicast_tvlv_packet->reserved = 0; unicast_tvlv_packet->tvlv_len = htons(tvlv_len); unicast_tvlv_packet->align = 0; ether_addr_copy(unicast_tvlv_packet->src, src); ether_addr_copy(unicast_tvlv_packet->dst, dst); tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; tvlv_hdr->version = version; tvlv_hdr->type = type; tvlv_hdr->len = htons(tvlv_value_len); tvlv_buff += sizeof(*tvlv_hdr); memcpy(tvlv_buff, tvlv_value, tvlv_value_len); batadv_send_skb_to_orig(skb, orig_node, NULL); out: batadv_orig_node_put(orig_node); } |
| 14 14 11 1 11 10 9 5 5 14 1 10 10 10 10 5 1 5 1 4 4 8 1 1 1 1 7 10 6 6 5 5 6 6 6 6 3 3 3 6 5 4 2 24 3 2 1 1 10 8 10 9 9 1 1 1 1 12 12 12 3 2 1 11 12 31 31 9 9 9 9 1 1 10 2 2 2 2 10 2 10 11 1 10 10 1 11 11 4 9 3 11 23 23 1 23 23 23 23 13 12 13 3 2 1 13 12 12 3 1 10 10 10 11 11 7 18 23 6 6 6 6 6 6 6 6 9 10 9 10 13 15 13 13 12 13 13 13 15 1 1 1 13 13 13 13 15 1 15 13 13 23 9 9 1 9 8 1 1 1 11 11 9 2 11 11 38 4 38 2 2 1 2 1 1 13 13 15 27 27 4 3 4 1 2 2 2 2 2 37 38 39 3 3 3 3 1 1 3 10 10 8 10 10 7 6 7 6 6 6 7 4 4 4 4 2 2 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 | // SPDX-License-Identifier: LGPL-2.1-or-later /* * dmxdev.c - DVB demultiplexer device * * Copyright (C) 2000 Ralph Metzler & Marcus Metzler * for convergence integrated media GmbH */ #define pr_fmt(fmt) "dmxdev: " fmt #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/poll.h> #include <linux/ioctl.h> #include <linux/wait.h> #include <linux/uaccess.h> #include <media/dmxdev.h> #include <media/dvb_vb2.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); #define dprintk(fmt, arg...) do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) static int dvb_dmxdev_buffer_write(struct dvb_ringbuffer *buf, const u8 *src, size_t len) { ssize_t free; if (!len) return 0; if (!buf->data) return 0; free = dvb_ringbuffer_free(buf); if (len > free) { dprintk("buffer overflow\n"); return -EOVERFLOW; } return dvb_ringbuffer_write(buf, src, len); } static ssize_t dvb_dmxdev_buffer_read(struct dvb_ringbuffer *src, int non_blocking, char __user *buf, size_t count, loff_t *ppos) { size_t todo; ssize_t avail; ssize_t ret = 0; if (!src->data) return 0; if (src->error) { ret = src->error; dvb_ringbuffer_flush(src); return ret; } for (todo = count; todo > 0; todo -= ret) { if (non_blocking && dvb_ringbuffer_empty(src)) { ret = -EWOULDBLOCK; break; } ret = wait_event_interruptible(src->queue, !dvb_ringbuffer_empty(src) || (src->error != 0)); if (ret < 0) break; if (src->error) { ret = src->error; dvb_ringbuffer_flush(src); break; } avail = dvb_ringbuffer_avail(src); if (avail > todo) avail = todo; ret = dvb_ringbuffer_read_user(src, buf, avail); if (ret < 0) break; buf += ret; } return (count - todo) ? (count - todo) : ret; } static struct dmx_frontend *get_fe(struct dmx_demux *demux, int type) { struct list_head *head, *pos; head = demux->get_frontends(demux); if (!head) return NULL; list_for_each(pos, head) if (DMX_FE_ENTRY(pos)->source == type) return DMX_FE_ENTRY(pos); return NULL; } static int dvb_dvr_open(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; struct dmx_frontend *front; bool need_ringbuffer = false; dprintk("%s\n", __func__); if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } dmxdev->may_do_mmap = 0; /* * The logic here is a little tricky due to the ifdef. * * The ringbuffer is used for both read and mmap. * * It is not needed, however, on two situations: * - Write devices (access with O_WRONLY); * - For duplex device nodes, opened with O_RDWR. */ if ((file->f_flags & O_ACCMODE) == O_RDONLY) need_ringbuffer = true; else if ((file->f_flags & O_ACCMODE) == O_RDWR) { if (!(dmxdev->capabilities & DMXDEV_CAP_DUPLEX)) { #ifdef CONFIG_DVB_MMAP dmxdev->may_do_mmap = 1; need_ringbuffer = true; #else mutex_unlock(&dmxdev->mutex); return -EOPNOTSUPP; #endif } } if (need_ringbuffer) { void *mem; if (!dvbdev->readers) { mutex_unlock(&dmxdev->mutex); return -EBUSY; } mem = vmalloc(DVR_BUFFER_SIZE); if (!mem) { mutex_unlock(&dmxdev->mutex); return -ENOMEM; } dvb_ringbuffer_init(&dmxdev->dvr_buffer, mem, DVR_BUFFER_SIZE); if (dmxdev->may_do_mmap) dvb_vb2_init(&dmxdev->dvr_vb2_ctx, "dvr", file->f_flags & O_NONBLOCK); dvbdev->readers--; } if ((file->f_flags & O_ACCMODE) == O_WRONLY) { dmxdev->dvr_orig_fe = dmxdev->demux->frontend; if (!dmxdev->demux->write) { mutex_unlock(&dmxdev->mutex); return -EOPNOTSUPP; } front = get_fe(dmxdev->demux, DMX_MEMORY_FE); if (!front) { mutex_unlock(&dmxdev->mutex); return -EINVAL; } dmxdev->demux->disconnect_frontend(dmxdev->demux); dmxdev->demux->connect_frontend(dmxdev->demux, front); } dvbdev->users++; mutex_unlock(&dmxdev->mutex); return 0; } static int dvb_dvr_release(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; mutex_lock(&dmxdev->mutex); if ((file->f_flags & O_ACCMODE) == O_WRONLY) { dmxdev->demux->disconnect_frontend(dmxdev->demux); dmxdev->demux->connect_frontend(dmxdev->demux, dmxdev->dvr_orig_fe); } if (((file->f_flags & O_ACCMODE) == O_RDONLY) || dmxdev->may_do_mmap) { if (dmxdev->may_do_mmap) { if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) dvb_vb2_stream_off(&dmxdev->dvr_vb2_ctx); dvb_vb2_release(&dmxdev->dvr_vb2_ctx); } dvbdev->readers++; if (dmxdev->dvr_buffer.data) { void *mem = dmxdev->dvr_buffer.data; /*memory barrier*/ mb(); spin_lock_irq(&dmxdev->lock); dmxdev->dvr_buffer.data = NULL; spin_unlock_irq(&dmxdev->lock); vfree(mem); } } /* TODO */ dvbdev->users--; if (dvbdev->users == 1 && dmxdev->exit == 1) { mutex_unlock(&dmxdev->mutex); wake_up(&dvbdev->wait_queue); } else mutex_unlock(&dmxdev->mutex); return 0; } static ssize_t dvb_dvr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int ret; if (!dmxdev->demux->write) return -EOPNOTSUPP; if ((file->f_flags & O_ACCMODE) != O_WRONLY) return -EINVAL; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } ret = dmxdev->demux->write(dmxdev->demux, buf, count); mutex_unlock(&dmxdev->mutex); return ret; } static ssize_t dvb_dvr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; if (dmxdev->exit) return -ENODEV; return dvb_dmxdev_buffer_read(&dmxdev->dvr_buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); } static int dvb_dvr_set_buffer_size(struct dmxdev *dmxdev, unsigned long size) { struct dvb_ringbuffer *buf = &dmxdev->dvr_buffer; void *newmem; void *oldmem; dprintk("%s\n", __func__); if (buf->size == size) return 0; if (!size) return -EINVAL; newmem = vmalloc(size); if (!newmem) return -ENOMEM; oldmem = buf->data; spin_lock_irq(&dmxdev->lock); buf->data = newmem; buf->size = size; /* reset and not flush in case the buffer shrinks */ dvb_ringbuffer_reset(buf); spin_unlock_irq(&dmxdev->lock); vfree(oldmem); return 0; } static inline void dvb_dmxdev_filter_state_set(struct dmxdev_filter *dmxdevfilter, int state) { spin_lock_irq(&dmxdevfilter->dev->lock); dmxdevfilter->state = state; spin_unlock_irq(&dmxdevfilter->dev->lock); } static int dvb_dmxdev_set_buffer_size(struct dmxdev_filter *dmxdevfilter, unsigned long size) { struct dvb_ringbuffer *buf = &dmxdevfilter->buffer; void *newmem; void *oldmem; if (buf->size == size) return 0; if (!size) return -EINVAL; if (dmxdevfilter->state >= DMXDEV_STATE_GO) return -EBUSY; newmem = vmalloc(size); if (!newmem) return -ENOMEM; oldmem = buf->data; spin_lock_irq(&dmxdevfilter->dev->lock); buf->data = newmem; buf->size = size; /* reset and not flush in case the buffer shrinks */ dvb_ringbuffer_reset(buf); spin_unlock_irq(&dmxdevfilter->dev->lock); vfree(oldmem); return 0; } static void dvb_dmxdev_filter_timeout(struct timer_list *t) { struct dmxdev_filter *dmxdevfilter = timer_container_of(dmxdevfilter, t, timer); dmxdevfilter->buffer.error = -ETIMEDOUT; spin_lock_irq(&dmxdevfilter->dev->lock); dmxdevfilter->state = DMXDEV_STATE_TIMEDOUT; spin_unlock_irq(&dmxdevfilter->dev->lock); wake_up(&dmxdevfilter->buffer.queue); } static void dvb_dmxdev_filter_timer(struct dmxdev_filter *dmxdevfilter) { struct dmx_sct_filter_params *para = &dmxdevfilter->params.sec; timer_delete(&dmxdevfilter->timer); if (para->timeout) { dmxdevfilter->timer.expires = jiffies + 1 + (HZ / 2 + HZ * para->timeout) / 1000; add_timer(&dmxdevfilter->timer); } } static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len, const u8 *buffer2, size_t buffer2_len, struct dmx_section_filter *filter, u32 *buffer_flags) { struct dmxdev_filter *dmxdevfilter = filter->priv; int ret; if (!dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx) && dmxdevfilter->buffer.error) { wake_up(&dmxdevfilter->buffer.queue); return 0; } spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->state != DMXDEV_STATE_GO) { spin_unlock(&dmxdevfilter->dev->lock); return 0; } timer_delete(&dmxdevfilter->timer); dprintk("section callback %*ph\n", 6, buffer1); if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) { ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx, buffer1, buffer1_len, buffer_flags); if (ret == buffer1_len) ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx, buffer2, buffer2_len, buffer_flags); } else { ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer1, buffer1_len); if (ret == buffer1_len) { ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer2, buffer2_len); } } if (ret < 0) dmxdevfilter->buffer.error = ret; if (dmxdevfilter->params.sec.flags & DMX_ONESHOT) dmxdevfilter->state = DMXDEV_STATE_DONE; spin_unlock(&dmxdevfilter->dev->lock); wake_up(&dmxdevfilter->buffer.queue); return 0; } static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len, const u8 *buffer2, size_t buffer2_len, struct dmx_ts_feed *feed, u32 *buffer_flags) { struct dmxdev_filter *dmxdevfilter = feed->priv; struct dvb_ringbuffer *buffer; #ifdef CONFIG_DVB_MMAP struct dvb_vb2_ctx *ctx; #endif int ret; spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->params.pes.output == DMX_OUT_DECODER) { spin_unlock(&dmxdevfilter->dev->lock); return 0; } if (dmxdevfilter->params.pes.output == DMX_OUT_TAP || dmxdevfilter->params.pes.output == DMX_OUT_TSDEMUX_TAP) { buffer = &dmxdevfilter->buffer; #ifdef CONFIG_DVB_MMAP ctx = &dmxdevfilter->vb2_ctx; #endif } else { buffer = &dmxdevfilter->dev->dvr_buffer; #ifdef CONFIG_DVB_MMAP ctx = &dmxdevfilter->dev->dvr_vb2_ctx; #endif } if (dvb_vb2_is_streaming(ctx)) { ret = dvb_vb2_fill_buffer(ctx, buffer1, buffer1_len, buffer_flags); if (ret == buffer1_len) ret = dvb_vb2_fill_buffer(ctx, buffer2, buffer2_len, buffer_flags); } else { if (buffer->error) { spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } ret = dvb_dmxdev_buffer_write(buffer, buffer1, buffer1_len); if (ret == buffer1_len) ret = dvb_dmxdev_buffer_write(buffer, buffer2, buffer2_len); } if (ret < 0) buffer->error = ret; spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } /* stop feed but only mark the specified filter as stopped (state set) */ static int dvb_dmxdev_feed_stop(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); switch (dmxdevfilter->type) { case DMXDEV_TYPE_SEC: timer_delete(&dmxdevfilter->timer); dmxdevfilter->feed.sec->stop_filtering(dmxdevfilter->feed.sec); break; case DMXDEV_TYPE_PES: list_for_each_entry(feed, &dmxdevfilter->feed.ts, next) feed->ts->stop_filtering(feed->ts); break; default: return -EINVAL; } return 0; } /* start feed associated with the specified filter */ static int dvb_dmxdev_feed_start(struct dmxdev_filter *filter) { struct dmxdev_feed *feed; int ret; dvb_dmxdev_filter_state_set(filter, DMXDEV_STATE_GO); switch (filter->type) { case DMXDEV_TYPE_SEC: return filter->feed.sec->start_filtering(filter->feed.sec); case DMXDEV_TYPE_PES: list_for_each_entry(feed, &filter->feed.ts, next) { ret = feed->ts->start_filtering(feed->ts); if (ret < 0) { dvb_dmxdev_feed_stop(filter); return ret; } } break; default: return -EINVAL; } return 0; } /* restart section feed if it has filters left associated with it, otherwise release the feed */ static int dvb_dmxdev_feed_restart(struct dmxdev_filter *filter) { int i; struct dmxdev *dmxdev = filter->dev; u16 pid = filter->params.sec.pid; for (i = 0; i < dmxdev->filternum; i++) if (dmxdev->filter[i].state >= DMXDEV_STATE_GO && dmxdev->filter[i].type == DMXDEV_TYPE_SEC && dmxdev->filter[i].params.sec.pid == pid) { dvb_dmxdev_feed_start(&dmxdev->filter[i]); return 0; } filter->dev->demux->release_section_feed(dmxdev->demux, filter->feed.sec); return 0; } static int dvb_dmxdev_filter_stop(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed; struct dmx_demux *demux; if (dmxdevfilter->state < DMXDEV_STATE_GO) return 0; switch (dmxdevfilter->type) { case DMXDEV_TYPE_SEC: if (!dmxdevfilter->feed.sec) break; dvb_dmxdev_feed_stop(dmxdevfilter); if (dmxdevfilter->filter.sec) dmxdevfilter->feed.sec-> release_filter(dmxdevfilter->feed.sec, dmxdevfilter->filter.sec); dvb_dmxdev_feed_restart(dmxdevfilter); dmxdevfilter->feed.sec = NULL; break; case DMXDEV_TYPE_PES: dvb_dmxdev_feed_stop(dmxdevfilter); demux = dmxdevfilter->dev->demux; list_for_each_entry(feed, &dmxdevfilter->feed.ts, next) { demux->release_ts_feed(demux, feed->ts); feed->ts = NULL; } break; default: if (dmxdevfilter->state == DMXDEV_STATE_ALLOCATED) return 0; return -EINVAL; } dvb_ringbuffer_flush(&dmxdevfilter->buffer); return 0; } static void dvb_dmxdev_delete_pids(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed, *tmp; /* delete all PIDs */ list_for_each_entry_safe(feed, tmp, &dmxdevfilter->feed.ts, next) { list_del(&feed->next); kfree(feed); } BUG_ON(!list_empty(&dmxdevfilter->feed.ts)); } static inline int dvb_dmxdev_filter_reset(struct dmxdev_filter *dmxdevfilter) { if (dmxdevfilter->state < DMXDEV_STATE_SET) return 0; if (dmxdevfilter->type == DMXDEV_TYPE_PES) dvb_dmxdev_delete_pids(dmxdevfilter); dmxdevfilter->type = DMXDEV_TYPE_NONE; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED); return 0; } static int dvb_dmxdev_start_feed(struct dmxdev *dmxdev, struct dmxdev_filter *filter, struct dmxdev_feed *feed) { ktime_t timeout = ktime_set(0, 0); struct dmx_pes_filter_params *para = &filter->params.pes; enum dmx_output otype; int ret; int ts_type; enum dmx_ts_pes ts_pes; struct dmx_ts_feed *tsfeed; feed->ts = NULL; otype = para->output; ts_pes = para->pes_type; if (ts_pes < DMX_PES_OTHER) ts_type = TS_DECODER; else ts_type = 0; if (otype == DMX_OUT_TS_TAP) ts_type |= TS_PACKET; else if (otype == DMX_OUT_TSDEMUX_TAP) ts_type |= TS_PACKET | TS_DEMUX; else if (otype == DMX_OUT_TAP) ts_type |= TS_PACKET | TS_DEMUX | TS_PAYLOAD_ONLY; ret = dmxdev->demux->allocate_ts_feed(dmxdev->demux, &feed->ts, dvb_dmxdev_ts_callback); if (ret < 0) return ret; tsfeed = feed->ts; tsfeed->priv = filter; ret = tsfeed->set(tsfeed, feed->pid, ts_type, ts_pes, timeout); if (ret < 0) { dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed); return ret; } ret = tsfeed->start_filtering(tsfeed); if (ret < 0) { dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed); return ret; } return 0; } static int dvb_dmxdev_filter_start(struct dmxdev_filter *filter) { struct dmxdev *dmxdev = filter->dev; struct dmxdev_feed *feed; void *mem; int ret, i; if (filter->state < DMXDEV_STATE_SET) return -EINVAL; if (filter->state >= DMXDEV_STATE_GO) dvb_dmxdev_filter_stop(filter); if (!filter->buffer.data) { mem = vmalloc(filter->buffer.size); if (!mem) return -ENOMEM; spin_lock_irq(&filter->dev->lock); filter->buffer.data = mem; spin_unlock_irq(&filter->dev->lock); } dvb_ringbuffer_flush(&filter->buffer); switch (filter->type) { case DMXDEV_TYPE_SEC: { struct dmx_sct_filter_params *para = &filter->params.sec; struct dmx_section_filter **secfilter = &filter->filter.sec; struct dmx_section_feed **secfeed = &filter->feed.sec; *secfilter = NULL; *secfeed = NULL; /* find active filter/feed with same PID */ for (i = 0; i < dmxdev->filternum; i++) { if (dmxdev->filter[i].state >= DMXDEV_STATE_GO && dmxdev->filter[i].type == DMXDEV_TYPE_SEC && dmxdev->filter[i].params.sec.pid == para->pid) { *secfeed = dmxdev->filter[i].feed.sec; break; } } /* if no feed found, try to allocate new one */ if (!*secfeed) { ret = dmxdev->demux->allocate_section_feed(dmxdev->demux, secfeed, dvb_dmxdev_section_callback); if (!*secfeed) { pr_err("DVB (%s): could not alloc feed\n", __func__); return ret; } ret = (*secfeed)->set(*secfeed, para->pid, (para->flags & DMX_CHECK_CRC) ? 1 : 0); if (ret < 0) { pr_err("DVB (%s): could not set feed\n", __func__); dvb_dmxdev_feed_restart(filter); return ret; } } else { dvb_dmxdev_feed_stop(filter); } ret = (*secfeed)->allocate_filter(*secfeed, secfilter); if (ret < 0) { dvb_dmxdev_feed_restart(filter); *secfeed = NULL; dprintk("could not get filter\n"); return ret; } (*secfilter)->priv = filter; memcpy(&((*secfilter)->filter_value[3]), &(para->filter.filter[1]), DMX_FILTER_SIZE - 1); memcpy(&(*secfilter)->filter_mask[3], ¶->filter.mask[1], DMX_FILTER_SIZE - 1); memcpy(&(*secfilter)->filter_mode[3], ¶->filter.mode[1], DMX_FILTER_SIZE - 1); (*secfilter)->filter_value[0] = para->filter.filter[0]; (*secfilter)->filter_mask[0] = para->filter.mask[0]; (*secfilter)->filter_mode[0] = para->filter.mode[0]; (*secfilter)->filter_mask[1] = 0; (*secfilter)->filter_mask[2] = 0; filter->todo = 0; ret = filter->feed.sec->start_filtering(filter->feed.sec); if (ret < 0) return ret; dvb_dmxdev_filter_timer(filter); break; } case DMXDEV_TYPE_PES: list_for_each_entry(feed, &filter->feed.ts, next) { ret = dvb_dmxdev_start_feed(dmxdev, filter, feed); if (ret < 0) { dvb_dmxdev_filter_stop(filter); return ret; } } break; default: return -EINVAL; } dvb_dmxdev_filter_state_set(filter, DMXDEV_STATE_GO); return 0; } static int dvb_demux_open(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int i; struct dmxdev_filter *dmxdevfilter; if (!dmxdev->filter) return -EINVAL; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } for (i = 0; i < dmxdev->filternum; i++) if (dmxdev->filter[i].state == DMXDEV_STATE_FREE) break; if (i == dmxdev->filternum) { mutex_unlock(&dmxdev->mutex); return -EMFILE; } dmxdevfilter = &dmxdev->filter[i]; mutex_init(&dmxdevfilter->mutex); file->private_data = dmxdevfilter; #ifdef CONFIG_DVB_MMAP dmxdev->may_do_mmap = 1; #else dmxdev->may_do_mmap = 0; #endif dvb_ringbuffer_init(&dmxdevfilter->buffer, NULL, 8192); dvb_vb2_init(&dmxdevfilter->vb2_ctx, "demux_filter", file->f_flags & O_NONBLOCK); dmxdevfilter->type = DMXDEV_TYPE_NONE; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED); timer_setup(&dmxdevfilter->timer, dvb_dmxdev_filter_timeout, 0); dvbdev->users++; mutex_unlock(&dmxdev->mutex); return 0; } static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter) { mutex_lock(&dmxdev->mutex); mutex_lock(&dmxdevfilter->mutex); if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) dvb_vb2_stream_off(&dmxdevfilter->vb2_ctx); dvb_vb2_release(&dmxdevfilter->vb2_ctx); dvb_dmxdev_filter_stop(dmxdevfilter); dvb_dmxdev_filter_reset(dmxdevfilter); if (dmxdevfilter->buffer.data) { void *mem = dmxdevfilter->buffer.data; spin_lock_irq(&dmxdev->lock); dmxdevfilter->buffer.data = NULL; spin_unlock_irq(&dmxdev->lock); vfree(mem); } dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_FREE); wake_up(&dmxdevfilter->buffer.queue); mutex_unlock(&dmxdevfilter->mutex); mutex_unlock(&dmxdev->mutex); return 0; } static inline void invert_mode(struct dmx_filter *filter) { int i; for (i = 0; i < DMX_FILTER_SIZE; i++) filter->mode[i] ^= 0xff; } static int dvb_dmxdev_add_pid(struct dmxdev *dmxdev, struct dmxdev_filter *filter, u16 pid) { struct dmxdev_feed *feed; if ((filter->type != DMXDEV_TYPE_PES) || (filter->state < DMXDEV_STATE_SET)) return -EINVAL; /* only TS packet filters may have multiple PIDs */ if ((filter->params.pes.output != DMX_OUT_TSDEMUX_TAP) && (!list_empty(&filter->feed.ts))) return -EINVAL; feed = kzalloc(sizeof(struct dmxdev_feed), GFP_KERNEL); if (feed == NULL) return -ENOMEM; feed->pid = pid; list_add(&feed->next, &filter->feed.ts); if (filter->state >= DMXDEV_STATE_GO) return dvb_dmxdev_start_feed(dmxdev, filter, feed); return 0; } static int dvb_dmxdev_remove_pid(struct dmxdev *dmxdev, struct dmxdev_filter *filter, u16 pid) { struct dmxdev_feed *feed, *tmp; if ((filter->type != DMXDEV_TYPE_PES) || (filter->state < DMXDEV_STATE_SET)) return -EINVAL; list_for_each_entry_safe(feed, tmp, &filter->feed.ts, next) { if ((feed->pid == pid) && (feed->ts != NULL)) { feed->ts->stop_filtering(feed->ts); filter->dev->demux->release_ts_feed(filter->dev->demux, feed->ts); list_del(&feed->next); kfree(feed); } } return 0; } static int dvb_dmxdev_filter_set(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter, struct dmx_sct_filter_params *params) { dprintk("%s: PID=0x%04x, flags=%02x, timeout=%d\n", __func__, params->pid, params->flags, params->timeout); dvb_dmxdev_filter_stop(dmxdevfilter); dmxdevfilter->type = DMXDEV_TYPE_SEC; memcpy(&dmxdevfilter->params.sec, params, sizeof(struct dmx_sct_filter_params)); invert_mode(&dmxdevfilter->params.sec.filter); dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); if (params->flags & DMX_IMMEDIATE_START) return dvb_dmxdev_filter_start(dmxdevfilter); return 0; } static int dvb_dmxdev_pes_filter_set(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter, struct dmx_pes_filter_params *params) { int ret; dvb_dmxdev_filter_stop(dmxdevfilter); dvb_dmxdev_filter_reset(dmxdevfilter); if ((unsigned int)params->pes_type > DMX_PES_OTHER) return -EINVAL; dmxdevfilter->type = DMXDEV_TYPE_PES; memcpy(&dmxdevfilter->params, params, sizeof(struct dmx_pes_filter_params)); INIT_LIST_HEAD(&dmxdevfilter->feed.ts); dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter, dmxdevfilter->params.pes.pid); if (ret < 0) return ret; if (params->flags & DMX_IMMEDIATE_START) return dvb_dmxdev_filter_start(dmxdevfilter); return 0; } static ssize_t dvb_dmxdev_read_sec(struct dmxdev_filter *dfil, struct file *file, char __user *buf, size_t count, loff_t *ppos) { int result, hcount; int done = 0; if (dfil->todo <= 0) { hcount = 3 + dfil->todo; if (hcount > count) hcount = count; result = dvb_dmxdev_buffer_read(&dfil->buffer, file->f_flags & O_NONBLOCK, buf, hcount, ppos); if (result < 0) { dfil->todo = 0; return result; } if (copy_from_user(dfil->secheader - dfil->todo, buf, result)) return -EFAULT; buf += result; done = result; count -= result; dfil->todo -= result; if (dfil->todo > -3) return done; dfil->todo = ((dfil->secheader[1] << 8) | dfil->secheader[2]) & 0xfff; if (!count) return done; } if (count > dfil->todo) count = dfil->todo; result = dvb_dmxdev_buffer_read(&dfil->buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); if (result < 0) return result; dfil->todo -= result; return (result + done); } static ssize_t dvb_demux_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct dmxdev_filter *dmxdevfilter = file->private_data; int ret; if (mutex_lock_interruptible(&dmxdevfilter->mutex)) return -ERESTARTSYS; if (dmxdevfilter->type == DMXDEV_TYPE_SEC) ret = dvb_dmxdev_read_sec(dmxdevfilter, file, buf, count, ppos); else ret = dvb_dmxdev_buffer_read(&dmxdevfilter->buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); mutex_unlock(&dmxdevfilter->mutex); return ret; } static int dvb_demux_do_ioctl(struct file *file, unsigned int cmd, void *parg) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; unsigned long arg = (unsigned long)parg; int ret = 0; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; switch (cmd) { case DMX_START: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } if (dmxdevfilter->state < DMXDEV_STATE_SET) ret = -EINVAL; else ret = dvb_dmxdev_filter_start(dmxdevfilter); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_STOP: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_filter_stop(dmxdevfilter); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_FILTER: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_filter_set(dmxdev, dmxdevfilter, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_PES_FILTER: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_pes_filter_set(dmxdev, dmxdevfilter, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_BUFFER_SIZE: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_set_buffer_size(dmxdevfilter, arg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_GET_PES_PIDS: if (!dmxdev->demux->get_pes_pids) { ret = -EINVAL; break; } dmxdev->demux->get_pes_pids(dmxdev->demux, parg); break; case DMX_GET_STC: if (!dmxdev->demux->get_stc) { ret = -EINVAL; break; } ret = dmxdev->demux->get_stc(dmxdev->demux, ((struct dmx_stc *)parg)->num, &((struct dmx_stc *)parg)->stc, &((struct dmx_stc *)parg)->base); break; case DMX_ADD_PID: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { ret = -ERESTARTSYS; break; } ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter, *(u16 *)parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_REMOVE_PID: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { ret = -ERESTARTSYS; break; } ret = dvb_dmxdev_remove_pid(dmxdev, dmxdevfilter, *(u16 *)parg); mutex_unlock(&dmxdevfilter->mutex); break; #ifdef CONFIG_DVB_MMAP case DMX_REQBUFS: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_reqbufs(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_QUERYBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_querybuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_EXPBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_expbuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_QBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_qbuf(&dmxdevfilter->vb2_ctx, parg); if (ret == 0 && !dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) ret = dvb_vb2_stream_on(&dmxdevfilter->vb2_ctx); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_DQBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_dqbuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; #endif default: ret = -ENOTTY; break; } mutex_unlock(&dmxdev->mutex); return ret; } static long dvb_demux_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return dvb_usercopy(file, cmd, arg, dvb_demux_do_ioctl); } static __poll_t dvb_demux_poll(struct file *file, poll_table *wait) { struct dmxdev_filter *dmxdevfilter = file->private_data; __poll_t mask = 0; poll_wait(file, &dmxdevfilter->buffer.queue, wait); if ((!dmxdevfilter) || dmxdevfilter->dev->exit) return EPOLLERR; if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) return dvb_vb2_poll(&dmxdevfilter->vb2_ctx, file, wait); if (dmxdevfilter->state != DMXDEV_STATE_GO && dmxdevfilter->state != DMXDEV_STATE_DONE && dmxdevfilter->state != DMXDEV_STATE_TIMEDOUT) return 0; if (dmxdevfilter->buffer.error) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR); if (!dvb_ringbuffer_empty(&dmxdevfilter->buffer)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI); return mask; } #ifdef CONFIG_DVB_MMAP static int dvb_demux_mmap(struct file *file, struct vm_area_struct *vma) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; int ret; if (!dmxdev->may_do_mmap) return -ENOTTY; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_mmap(&dmxdevfilter->vb2_ctx, vma); mutex_unlock(&dmxdevfilter->mutex); mutex_unlock(&dmxdev->mutex); return ret; } #endif static int dvb_demux_release(struct inode *inode, struct file *file) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; int ret; ret = dvb_dmxdev_filter_free(dmxdev, dmxdevfilter); mutex_lock(&dmxdev->mutex); dmxdev->dvbdev->users--; if (dmxdev->dvbdev->users == 1 && dmxdev->exit == 1) { mutex_unlock(&dmxdev->mutex); wake_up(&dmxdev->dvbdev->wait_queue); } else mutex_unlock(&dmxdev->mutex); return ret; } static const struct file_operations dvb_demux_fops = { .owner = THIS_MODULE, .read = dvb_demux_read, .unlocked_ioctl = dvb_demux_ioctl, .compat_ioctl = dvb_demux_ioctl, .open = dvb_demux_open, .release = dvb_demux_release, .poll = dvb_demux_poll, .llseek = default_llseek, #ifdef CONFIG_DVB_MMAP .mmap = dvb_demux_mmap, #endif }; static const struct dvb_device dvbdev_demux = { .priv = NULL, .users = 1, .writers = 1, #if defined(CONFIG_MEDIA_CONTROLLER_DVB) .name = "dvb-demux", #endif .fops = &dvb_demux_fops }; static int dvb_dvr_do_ioctl(struct file *file, unsigned int cmd, void *parg) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; unsigned long arg = (unsigned long)parg; int ret; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; switch (cmd) { case DMX_SET_BUFFER_SIZE: ret = dvb_dvr_set_buffer_size(dmxdev, arg); break; #ifdef CONFIG_DVB_MMAP case DMX_REQBUFS: ret = dvb_vb2_reqbufs(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_QUERYBUF: ret = dvb_vb2_querybuf(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_EXPBUF: ret = dvb_vb2_expbuf(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_QBUF: ret = dvb_vb2_qbuf(&dmxdev->dvr_vb2_ctx, parg); if (ret == 0 && !dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) ret = dvb_vb2_stream_on(&dmxdev->dvr_vb2_ctx); break; case DMX_DQBUF: ret = dvb_vb2_dqbuf(&dmxdev->dvr_vb2_ctx, parg); break; #endif default: ret = -ENOTTY; break; } mutex_unlock(&dmxdev->mutex); return ret; } static long dvb_dvr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return dvb_usercopy(file, cmd, arg, dvb_dvr_do_ioctl); } static __poll_t dvb_dvr_poll(struct file *file, poll_table *wait) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; __poll_t mask = 0; dprintk("%s\n", __func__); poll_wait(file, &dmxdev->dvr_buffer.queue, wait); if (dmxdev->exit) return EPOLLERR; if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) return dvb_vb2_poll(&dmxdev->dvr_vb2_ctx, file, wait); if (((file->f_flags & O_ACCMODE) == O_RDONLY) || dmxdev->may_do_mmap) { if (dmxdev->dvr_buffer.error) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR); if (!dvb_ringbuffer_empty(&dmxdev->dvr_buffer)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI); } else mask |= (EPOLLOUT | EPOLLWRNORM | EPOLLPRI); return mask; } #ifdef CONFIG_DVB_MMAP static int dvb_dvr_mmap(struct file *file, struct vm_area_struct *vma) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int ret; if (!dmxdev->may_do_mmap) return -ENOTTY; if (dmxdev->exit) return -ENODEV; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; ret = dvb_vb2_mmap(&dmxdev->dvr_vb2_ctx, vma); mutex_unlock(&dmxdev->mutex); return ret; } #endif static const struct file_operations dvb_dvr_fops = { .owner = THIS_MODULE, .read = dvb_dvr_read, .write = dvb_dvr_write, .unlocked_ioctl = dvb_dvr_ioctl, .open = dvb_dvr_open, .release = dvb_dvr_release, .poll = dvb_dvr_poll, .llseek = default_llseek, #ifdef CONFIG_DVB_MMAP .mmap = dvb_dvr_mmap, #endif }; static const struct dvb_device dvbdev_dvr = { .priv = NULL, .readers = 1, .users = 1, #if defined(CONFIG_MEDIA_CONTROLLER_DVB) .name = "dvb-dvr", #endif .fops = &dvb_dvr_fops }; int dvb_dmxdev_init(struct dmxdev *dmxdev, struct dvb_adapter *dvb_adapter) { int i, ret; if (dmxdev->demux->open(dmxdev->demux) < 0) return -EUSERS; dmxdev->filter = vmalloc_array(dmxdev->filternum, sizeof(struct dmxdev_filter)); if (!dmxdev->filter) return -ENOMEM; mutex_init(&dmxdev->mutex); spin_lock_init(&dmxdev->lock); for (i = 0; i < dmxdev->filternum; i++) { dmxdev->filter[i].dev = dmxdev; dmxdev->filter[i].buffer.data = NULL; dvb_dmxdev_filter_state_set(&dmxdev->filter[i], DMXDEV_STATE_FREE); } ret = dvb_register_device(dvb_adapter, &dmxdev->dvbdev, &dvbdev_demux, dmxdev, DVB_DEVICE_DEMUX, dmxdev->filternum); if (ret < 0) goto err_register_dvbdev; ret = dvb_register_device(dvb_adapter, &dmxdev->dvr_dvbdev, &dvbdev_dvr, dmxdev, DVB_DEVICE_DVR, dmxdev->filternum); if (ret < 0) goto err_register_dvr_dvbdev; dvb_ringbuffer_init(&dmxdev->dvr_buffer, NULL, 8192); return 0; err_register_dvr_dvbdev: dvb_unregister_device(dmxdev->dvbdev); err_register_dvbdev: vfree(dmxdev->filter); dmxdev->filter = NULL; return ret; } EXPORT_SYMBOL(dvb_dmxdev_init); void dvb_dmxdev_release(struct dmxdev *dmxdev) { mutex_lock(&dmxdev->mutex); dmxdev->exit = 1; mutex_unlock(&dmxdev->mutex); if (dmxdev->dvbdev->users > 1) { wait_event(dmxdev->dvbdev->wait_queue, dmxdev->dvbdev->users == 1); } if (dmxdev->dvr_dvbdev->users > 1) { wait_event(dmxdev->dvr_dvbdev->wait_queue, dmxdev->dvr_dvbdev->users == 1); } dvb_unregister_device(dmxdev->dvbdev); dvb_unregister_device(dmxdev->dvr_dvbdev); vfree(dmxdev->filter); dmxdev->filter = NULL; dmxdev->demux->close(dmxdev->demux); } EXPORT_SYMBOL(dvb_dmxdev_release); |
| 35 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #ifndef _BR_PRIVATE_STP_H #define _BR_PRIVATE_STP_H #define BPDU_TYPE_CONFIG 0 #define BPDU_TYPE_TCN 0x80 /* IEEE 802.1D-1998 timer values */ #define BR_MIN_HELLO_TIME (1*HZ) #define BR_MAX_HELLO_TIME (10*HZ) #define BR_MIN_FORWARD_DELAY (2*HZ) #define BR_MAX_FORWARD_DELAY (30*HZ) #define BR_MIN_MAX_AGE (6*HZ) #define BR_MAX_MAX_AGE (40*HZ) #define BR_MIN_PATH_COST 1 #define BR_MAX_PATH_COST 65535 struct br_config_bpdu { unsigned int topology_change:1; unsigned int topology_change_ack:1; bridge_id root; int root_path_cost; bridge_id bridge_id; port_id port_id; int message_age; int max_age; int hello_time; int forward_delay; }; /* called under bridge lock */ static inline int br_is_designated_port(const struct net_bridge_port *p) { return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) && (p->designated_port == p->port_id); } /* br_stp.c */ void br_become_root_bridge(struct net_bridge *br); void br_config_bpdu_generation(struct net_bridge *); void br_configuration_update(struct net_bridge *); void br_port_state_selection(struct net_bridge *); void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu); void br_received_tcn_bpdu(struct net_bridge_port *p); void br_transmit_config(struct net_bridge_port *p); void br_transmit_tcn(struct net_bridge *br); void br_topology_change_detection(struct net_bridge *br); void __br_set_topology_change(struct net_bridge *br, unsigned char val); /* br_stp_bpdu.c */ void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *); void br_send_tcn_bpdu(struct net_bridge_port *); #endif |
| 462 746 747 745 745 746 745 745 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; } #else struct insn_field { insn_value_t value; union { insn_value_t little; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->little = __cpu_to_le32(v); p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; p->value = __le32_to_cpu(p->little); } #endif struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ union { struct insn_field vex_prefix; /* VEX prefix */ struct insn_field xop_prefix; /* XOP prefix */ }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ #define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ #define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ #define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ #define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ #define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ #define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ #define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x07) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ /* XOP bit fields */ #define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ #define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ #define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ #define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ #define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ #define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ #define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ #define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ #define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ #define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); extern int insn_get_opcode(struct insn *insn); extern int insn_get_modrm(struct insn *insn); extern int insn_get_sib(struct insn *insn); extern int insn_get_displacement(struct insn *insn); extern int insn_get_immediate(struct insn *insn); extern int insn_get_length(struct insn *insn); enum insn_mode { INSN_MODE_32, INSN_MODE_64, /* Mode is determined by the current kernel build. */ INSN_MODE_KERN, INSN_NUM_MODES, }; extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); #define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); static inline int insn_is_rex2(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return insn->rex_prefix.nbytes == 2; } static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) { return X86_REX2_M(insn->rex_prefix.bytes[1]); } static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } /* If we already know this is AVX/XOP encoded */ static inline int avx_insn_is_xop(struct insn *insn) { insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); return inat_is_xop_prefix(attr); } static inline int insn_is_xop(struct insn *insn) { if (!insn_is_avx_or_xop(insn)) return 0; return avx_insn_is_xop(insn); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } static inline insn_byte_t insn_vex_w_bit(struct insn *insn) { if (insn->vex_prefix.nbytes < 3) return 0; return X86_VEX_W(insn->vex_prefix.bytes[2]); } static inline insn_byte_t insn_xop_map_bits(struct insn *insn) { if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ return 0; return X86_XOP_M(insn->xop_prefix.bytes[1]); } static inline insn_byte_t insn_xop_p_bits(struct insn *insn) { return X86_XOP_P(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx_or_xop(insn)) { if (avx_insn_is_xop(insn)) return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, prefix) \ for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */ |
| 25 25 25 25 25 25 25 25 22 18 25 18 18 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Randomness driver for virtio * Copyright (C) 2007, 2008 Rusty Russell IBM Corporation */ #include <asm/barrier.h> #include <linux/err.h> #include <linux/hw_random.h> #include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/virtio.h> #include <linux/virtio_rng.h> #include <linux/module.h> #include <linux/slab.h> static DEFINE_IDA(rng_index_ida); struct virtrng_info { struct hwrng hwrng; struct virtqueue *vq; char name[25]; int index; bool hwrng_register_done; bool hwrng_removed; /* data transfer */ struct completion have_data; unsigned int data_avail; unsigned int data_idx; /* minimal size returned by rng_buffer_size() */ #if SMP_CACHE_BYTES < 32 u8 data[32]; #else u8 data[SMP_CACHE_BYTES]; #endif }; static void random_recv_done(struct virtqueue *vq) { struct virtrng_info *vi = vq->vdev->priv; unsigned int len; /* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */ if (!virtqueue_get_buf(vi->vq, &len)) return; smp_store_release(&vi->data_avail, len); complete(&vi->have_data); } static void request_entropy(struct virtrng_info *vi) { struct scatterlist sg; reinit_completion(&vi->have_data); vi->data_idx = 0; sg_init_one(&sg, vi->data, sizeof(vi->data)); /* There should always be room for one buffer. */ virtqueue_add_inbuf(vi->vq, &sg, 1, vi->data, GFP_KERNEL); virtqueue_kick(vi->vq); } static unsigned int copy_data(struct virtrng_info *vi, void *buf, unsigned int size) { size = min_t(unsigned int, size, vi->data_avail); memcpy(buf, vi->data + vi->data_idx, size); vi->data_idx += size; vi->data_avail -= size; if (vi->data_avail == 0) request_entropy(vi); return size; } static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) { int ret; struct virtrng_info *vi = (struct virtrng_info *)rng->priv; unsigned int chunk; size_t read; if (vi->hwrng_removed) return -ENODEV; read = 0; /* copy available data */ if (smp_load_acquire(&vi->data_avail)) { chunk = copy_data(vi, buf, size); size -= chunk; read += chunk; } if (!wait) return read; /* We have already copied available entropy, * so either size is 0 or data_avail is 0 */ while (size != 0) { /* data_avail is 0 but a request is pending */ ret = wait_for_completion_killable(&vi->have_data); if (ret < 0) return ret; /* if vi->data_avail is 0, we have been interrupted * by a cleanup, but buffer stays in the queue */ if (vi->data_avail == 0) return read; chunk = copy_data(vi, buf + read, size); size -= chunk; read += chunk; } return read; } static void virtio_cleanup(struct hwrng *rng) { struct virtrng_info *vi = (struct virtrng_info *)rng->priv; complete(&vi->have_data); } static int probe_common(struct virtio_device *vdev) { int err, index; struct virtrng_info *vi = NULL; vi = kzalloc(sizeof(struct virtrng_info), GFP_KERNEL); if (!vi) return -ENOMEM; vi->index = index = ida_alloc(&rng_index_ida, GFP_KERNEL); if (index < 0) { err = index; goto err_ida; } sprintf(vi->name, "virtio_rng.%d", index); init_completion(&vi->have_data); vi->hwrng = (struct hwrng) { .read = virtio_read, .cleanup = virtio_cleanup, .priv = (unsigned long)vi, .name = vi->name, }; vdev->priv = vi; /* We expect a single virtqueue. */ vi->vq = virtio_find_single_vq(vdev, random_recv_done, "input"); if (IS_ERR(vi->vq)) { err = PTR_ERR(vi->vq); goto err_find; } virtio_device_ready(vdev); /* we always have a pending entropy request */ request_entropy(vi); return 0; err_find: ida_free(&rng_index_ida, index); err_ida: kfree(vi); return err; } static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev->priv; vi->hwrng_removed = true; vi->data_avail = 0; vi->data_idx = 0; complete(&vi->have_data); if (vi->hwrng_register_done) hwrng_unregister(&vi->hwrng); virtio_reset_device(vdev); vdev->config->del_vqs(vdev); ida_free(&rng_index_ida, vi->index); kfree(vi); } static int virtrng_probe(struct virtio_device *vdev) { return probe_common(vdev); } static void virtrng_remove(struct virtio_device *vdev) { remove_common(vdev); } static void virtrng_scan(struct virtio_device *vdev) { struct virtrng_info *vi = vdev->priv; int err; err = hwrng_register(&vi->hwrng); if (!err) vi->hwrng_register_done = true; } static int virtrng_freeze(struct virtio_device *vdev) { remove_common(vdev); return 0; } static int virtrng_restore(struct virtio_device *vdev) { int err; err = probe_common(vdev); if (!err) { struct virtrng_info *vi = vdev->priv; /* * Set hwrng_removed to ensure that virtio_read() * does not block waiting for data before the * registration is complete. */ vi->hwrng_removed = true; err = hwrng_register(&vi->hwrng); if (!err) { vi->hwrng_register_done = true; vi->hwrng_removed = false; } } return err; } static const struct virtio_device_id id_table[] = { { VIRTIO_ID_RNG, VIRTIO_DEV_ANY_ID }, { 0 }, }; static struct virtio_driver virtio_rng_driver = { .driver.name = KBUILD_MODNAME, .id_table = id_table, .probe = virtrng_probe, .remove = virtrng_remove, .scan = virtrng_scan, .freeze = pm_sleep_ptr(virtrng_freeze), .restore = pm_sleep_ptr(virtrng_restore), }; module_virtio_driver(virtio_rng_driver); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio random number driver"); MODULE_LICENSE("GPL"); |
| 55 57 57 21 21 21 22 9 9 5 5 5 54 39 39 39 39 30 39 38 38 15 24 38 38 23 15 15 54 7 57 56 56 1 1 1 17 5 5 5 5 5 5 5 2 2 1 2 2 2 2 1 2 57 57 57 49 25 56 32 57 56 33 32 57 57 29 57 56 57 63 63 3 3 3 3 3 3 1 2 3 2 2 2 1 1 4 1 1 1 1 1 1 7 7 7 6 7 4 4 2 2 1 2 1 1 1 1 10 10 1 1 2 2 1 3 3 1 1 1 2 2 1 2 2 3 2 1 1 1 3 3 5 5 4 3 2 2 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/bond/bond_options.c - bonding options * Copyright (c) 2013 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2013 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/errno.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/sched/signal.h> #include <net/bonding.h> #include <net/ndisc.h> static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_miimon_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_updelay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_peer_notif_delay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target); static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target); static int bond_option_arp_ip_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_reselect_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_fail_over_mac_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_xmit_hash_policy_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_resend_igmp_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_num_peer_notif_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_all_slaves_active_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_min_links_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lacp_active_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_select_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_queue_id_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_slaves_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_actor_port_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_missed_max_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_coupled_control_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_broadcast_neigh_set(struct bonding *bond, const struct bond_opt_value *newval); static const struct bond_opt_value bond_mode_tbl[] = { { "balance-rr", BOND_MODE_ROUNDROBIN, BOND_VALFLAG_DEFAULT}, { "active-backup", BOND_MODE_ACTIVEBACKUP, 0}, { "balance-xor", BOND_MODE_XOR, 0}, { "broadcast", BOND_MODE_BROADCAST, 0}, { "802.3ad", BOND_MODE_8023AD, 0}, { "balance-tlb", BOND_MODE_TLB, 0}, { "balance-alb", BOND_MODE_ALB, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_pps_tbl[] = { { "default", 1, BOND_VALFLAG_DEFAULT}, { "maxval", USHRT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_xmit_hashtype_tbl[] = { { "layer2", BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT}, { "layer3+4", BOND_XMIT_POLICY_LAYER34, 0}, { "layer2+3", BOND_XMIT_POLICY_LAYER23, 0}, { "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0}, { "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0}, { "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_arp_validate_tbl[] = { { "none", BOND_ARP_VALIDATE_NONE, BOND_VALFLAG_DEFAULT}, { "active", BOND_ARP_VALIDATE_ACTIVE, 0}, { "backup", BOND_ARP_VALIDATE_BACKUP, 0}, { "all", BOND_ARP_VALIDATE_ALL, 0}, { "filter", BOND_ARP_FILTER, 0}, { "filter_active", BOND_ARP_FILTER_ACTIVE, 0}, { "filter_backup", BOND_ARP_FILTER_BACKUP, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_arp_all_targets_tbl[] = { { "any", BOND_ARP_TARGETS_ANY, BOND_VALFLAG_DEFAULT}, { "all", BOND_ARP_TARGETS_ALL, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_fail_over_mac_tbl[] = { { "none", BOND_FOM_NONE, BOND_VALFLAG_DEFAULT}, { "active", BOND_FOM_ACTIVE, 0}, { "follow", BOND_FOM_FOLLOW, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_intmax_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "maxval", INT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lacp_active[] = { { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lacp_rate_tbl[] = { { "slow", AD_LACP_SLOW, 0}, { "fast", AD_LACP_FAST, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_ad_select_tbl[] = { { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, { "bandwidth", BOND_AD_BANDWIDTH, 0}, { "count", BOND_AD_COUNT, 0}, { "actor_port_prio", BOND_AD_PRIO, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_num_peer_notif_tbl[] = { { "off", 0, 0}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_peer_notif_delay_tbl[] = { { "off", 0, 0}, { "maxval", 300000, BOND_VALFLAG_MAX}, { NULL, -1, 0} }; static const struct bond_opt_value bond_primary_reselect_tbl[] = { { "always", BOND_PRI_RESELECT_ALWAYS, BOND_VALFLAG_DEFAULT}, { "better", BOND_PRI_RESELECT_BETTER, 0}, { "failure", BOND_PRI_RESELECT_FAILURE, 0}, { NULL, -1}, }; static const struct bond_opt_value bond_use_carrier_tbl[] = { { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_all_slaves_active_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "on", 1, 0}, { NULL, -1, 0} }; static const struct bond_opt_value bond_resend_igmp_tbl[] = { { "off", 0, 0}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lp_interval_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", INT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_tlb_dynamic_lb_tbl[] = { { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN}, { "maxval", 65535, BOND_VALFLAG_MAX | BOND_VALFLAG_DEFAULT}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_missed_max_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 2, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_coupled_control_tbl[] = { { "on", 1, BOND_VALFLAG_DEFAULT}, { "off", 0, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_broadcast_neigh_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "on", 1, 0}, { NULL, -1, 0} }; static const struct bond_option bond_opts[BOND_OPT_LAST] = { [BOND_OPT_MODE] = { .id = BOND_OPT_MODE, .name = "mode", .desc = "bond device mode", .flags = BOND_OPTFLAG_NOSLAVES | BOND_OPTFLAG_IFDOWN, .values = bond_mode_tbl, .set = bond_option_mode_set }, [BOND_OPT_PACKETS_PER_SLAVE] = { .id = BOND_OPT_PACKETS_PER_SLAVE, .name = "packets_per_slave", .desc = "Packets to send per slave in RR mode", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ROUNDROBIN)), .values = bond_pps_tbl, .set = bond_option_pps_set }, [BOND_OPT_XMIT_HASH] = { .id = BOND_OPT_XMIT_HASH, .name = "xmit_hash_policy", .desc = "balance-xor, 802.3ad, and tlb hashing method", .values = bond_xmit_hashtype_tbl, .set = bond_option_xmit_hash_policy_set }, [BOND_OPT_ARP_VALIDATE] = { .id = BOND_OPT_ARP_VALIDATE, .name = "arp_validate", .desc = "validate src/dst of ARP probes", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_arp_validate_tbl, .set = bond_option_arp_validate_set }, [BOND_OPT_ARP_ALL_TARGETS] = { .id = BOND_OPT_ARP_ALL_TARGETS, .name = "arp_all_targets", .desc = "fail on any/all arp targets timeout", .values = bond_arp_all_targets_tbl, .set = bond_option_arp_all_targets_set }, [BOND_OPT_FAIL_OVER_MAC] = { .id = BOND_OPT_FAIL_OVER_MAC, .name = "fail_over_mac", .desc = "For active-backup, do not set all slaves to the same MAC", .flags = BOND_OPTFLAG_NOSLAVES, .values = bond_fail_over_mac_tbl, .set = bond_option_fail_over_mac_set }, [BOND_OPT_ARP_INTERVAL] = { .id = BOND_OPT_ARP_INTERVAL, .name = "arp_interval", .desc = "arp interval in milliseconds", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_intmax_tbl, .set = bond_option_arp_interval_set }, [BOND_OPT_MISSED_MAX] = { .id = BOND_OPT_MISSED_MAX, .name = "arp_missed_max", .desc = "Maximum number of missed ARP interval", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_missed_max_tbl, .set = bond_option_missed_max_set }, [BOND_OPT_ARP_TARGETS] = { .id = BOND_OPT_ARP_TARGETS, .name = "arp_ip_target", .desc = "arp targets in n.n.n.n form", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_arp_ip_targets_set }, [BOND_OPT_NS_TARGETS] = { .id = BOND_OPT_NS_TARGETS, .name = "ns_ip6_target", .desc = "NS targets in ffff:ffff::ffff:ffff form", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_ns_ip6_targets_set }, [BOND_OPT_DOWNDELAY] = { .id = BOND_OPT_DOWNDELAY, .name = "downdelay", .desc = "Delay before considering link down, in milliseconds", .values = bond_intmax_tbl, .set = bond_option_downdelay_set }, [BOND_OPT_UPDELAY] = { .id = BOND_OPT_UPDELAY, .name = "updelay", .desc = "Delay before considering link up, in milliseconds", .values = bond_intmax_tbl, .set = bond_option_updelay_set }, [BOND_OPT_LACP_ACTIVE] = { .id = BOND_OPT_LACP_ACTIVE, .name = "lacp_active", .desc = "Send LACPDU frames with configured lacp rate or acts as speak when spoken to", .flags = BOND_OPTFLAG_IFDOWN, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_lacp_active, .set = bond_option_lacp_active_set }, [BOND_OPT_LACP_RATE] = { .id = BOND_OPT_LACP_RATE, .name = "lacp_rate", .desc = "LACPDU tx rate to request from 802.3ad partner", .flags = BOND_OPTFLAG_IFDOWN, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_lacp_rate_tbl, .set = bond_option_lacp_rate_set }, [BOND_OPT_MINLINKS] = { .id = BOND_OPT_MINLINKS, .name = "min_links", .desc = "Minimum number of available links before turning on carrier", .values = bond_intmax_tbl, .set = bond_option_min_links_set }, [BOND_OPT_AD_SELECT] = { .id = BOND_OPT_AD_SELECT, .name = "ad_select", .desc = "802.3ad aggregation selection logic", .flags = BOND_OPTFLAG_IFDOWN, .values = bond_ad_select_tbl, .set = bond_option_ad_select_set }, [BOND_OPT_NUM_PEER_NOTIF] = { .id = BOND_OPT_NUM_PEER_NOTIF, .name = "num_unsol_na", .desc = "Number of peer notifications to send on failover event", .values = bond_num_peer_notif_tbl, .set = bond_option_num_peer_notif_set }, [BOND_OPT_MIIMON] = { .id = BOND_OPT_MIIMON, .name = "miimon", .desc = "Link check interval in milliseconds", .values = bond_intmax_tbl, .set = bond_option_miimon_set }, [BOND_OPT_PRIO] = { .id = BOND_OPT_PRIO, .name = "prio", .desc = "Link priority for failover re-selection", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_prio_set }, [BOND_OPT_PRIMARY] = { .id = BOND_OPT_PRIMARY, .name = "primary", .desc = "Primary network device to use", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_primary_set }, [BOND_OPT_PRIMARY_RESELECT] = { .id = BOND_OPT_PRIMARY_RESELECT, .name = "primary_reselect", .desc = "Reselect primary slave once it comes up", .values = bond_primary_reselect_tbl, .set = bond_option_primary_reselect_set }, [BOND_OPT_USE_CARRIER] = { .id = BOND_OPT_USE_CARRIER, .name = "use_carrier", .desc = "option obsolete, use_carrier cannot be disabled", .values = bond_use_carrier_tbl, .set = bond_option_use_carrier_set }, [BOND_OPT_ACTIVE_SLAVE] = { .id = BOND_OPT_ACTIVE_SLAVE, .name = "active_slave", .desc = "Currently active slave", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_active_slave_set }, [BOND_OPT_QUEUE_ID] = { .id = BOND_OPT_QUEUE_ID, .name = "queue_id", .desc = "Set queue id of a slave", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_queue_id_set }, [BOND_OPT_ALL_SLAVES_ACTIVE] = { .id = BOND_OPT_ALL_SLAVES_ACTIVE, .name = "all_slaves_active", .desc = "Keep all frames received on an interface by setting active flag for all slaves", .values = bond_all_slaves_active_tbl, .set = bond_option_all_slaves_active_set }, [BOND_OPT_RESEND_IGMP] = { .id = BOND_OPT_RESEND_IGMP, .name = "resend_igmp", .desc = "Number of IGMP membership reports to send on link failure", .values = bond_resend_igmp_tbl, .set = bond_option_resend_igmp_set }, [BOND_OPT_LP_INTERVAL] = { .id = BOND_OPT_LP_INTERVAL, .name = "lp_interval", .desc = "The number of seconds between instances where the bonding driver sends learning packets to each slave's peer switch", .values = bond_lp_interval_tbl, .set = bond_option_lp_interval_set }, [BOND_OPT_SLAVES] = { .id = BOND_OPT_SLAVES, .name = "slaves", .desc = "Slave membership management", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_slaves_set }, [BOND_OPT_TLB_DYNAMIC_LB] = { .id = BOND_OPT_TLB_DYNAMIC_LB, .name = "tlb_dynamic_lb", .desc = "Enable dynamic flow shuffling", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .values = bond_tlb_dynamic_lb_tbl, .flags = BOND_OPTFLAG_IFDOWN, .set = bond_option_tlb_dynamic_lb_set, }, [BOND_OPT_AD_ACTOR_SYS_PRIO] = { .id = BOND_OPT_AD_ACTOR_SYS_PRIO, .name = "ad_actor_sys_prio", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, [BOND_OPT_ACTOR_PORT_PRIO] = { .id = BOND_OPT_ACTOR_PORT_PRIO, .name = "actor_port_prio", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_actor_port_prio_set, }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, .name = "ad_actor_system", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_ad_actor_system_set, }, [BOND_OPT_AD_USER_PORT_KEY] = { .id = BOND_OPT_AD_USER_PORT_KEY, .name = "ad_user_port_key", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_IFDOWN, .values = bond_ad_user_port_key_tbl, .set = bond_option_ad_user_port_key_set, }, [BOND_OPT_NUM_PEER_NOTIF_ALIAS] = { .id = BOND_OPT_NUM_PEER_NOTIF_ALIAS, .name = "num_grat_arp", .desc = "Number of peer notifications to send on failover event", .values = bond_num_peer_notif_tbl, .set = bond_option_num_peer_notif_set }, [BOND_OPT_PEER_NOTIF_DELAY] = { .id = BOND_OPT_PEER_NOTIF_DELAY, .name = "peer_notif_delay", .desc = "Delay between each peer notification on failover event, in milliseconds", .values = bond_peer_notif_delay_tbl, .set = bond_option_peer_notif_delay_set }, [BOND_OPT_COUPLED_CONTROL] = { .id = BOND_OPT_COUPLED_CONTROL, .name = "coupled_control", .desc = "Opt into using coupled control MUX for LACP states", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_IFDOWN, .values = bond_coupled_control_tbl, .set = bond_option_coupled_control_set, }, [BOND_OPT_BROADCAST_NEIGH] = { .id = BOND_OPT_BROADCAST_NEIGH, .name = "broadcast_neighbor", .desc = "Broadcast neighbor packets to all active slaves", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_broadcast_neigh_tbl, .set = bond_option_broadcast_neigh_set, } }; /* Searches for an option by name */ const struct bond_option *bond_opt_get_by_name(const char *name) { const struct bond_option *opt; int option; for (option = 0; option < BOND_OPT_LAST; option++) { opt = bond_opt_get(option); if (opt && !strcmp(opt->name, name)) return opt; } return NULL; } /* Searches for a value in opt's values[] table */ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val) { const struct bond_option *opt; int i; opt = bond_opt_get(option); if (WARN_ON(!opt)) return NULL; for (i = 0; opt->values && opt->values[i].string; i++) if (opt->values[i].value == val) return &opt->values[i]; return NULL; } /* Searches for a value in opt's values[] table which matches the flagmask */ static const struct bond_opt_value *bond_opt_get_flags(const struct bond_option *opt, u32 flagmask) { int i; for (i = 0; opt->values && opt->values[i].string; i++) if (opt->values[i].flags & flagmask) return &opt->values[i]; return NULL; } /* If maxval is missing then there's no range to check. In case minval is * missing then it's considered to be 0. */ static bool bond_opt_check_range(const struct bond_option *opt, u64 val) { const struct bond_opt_value *minval, *maxval; minval = bond_opt_get_flags(opt, BOND_VALFLAG_MIN); maxval = bond_opt_get_flags(opt, BOND_VALFLAG_MAX); if (!maxval || (minval && val < minval->value) || val > maxval->value) return false; return true; } /** * bond_opt_parse - parse option value * @opt: the option to parse against * @val: value to parse * * This function tries to extract the value from @val and check if it's * a possible match for the option and returns NULL if a match isn't found, * or the struct_opt_value that matched. It also strips the new line from * @val->string if it's present. */ const struct bond_opt_value *bond_opt_parse(const struct bond_option *opt, struct bond_opt_value *val) { char *p, valstr[BOND_OPT_MAX_NAMELEN + 1] = { 0, }; const struct bond_opt_value *tbl; const struct bond_opt_value *ret = NULL; bool checkval; int i, rv; /* No parsing if the option wants a raw val */ if (opt->flags & BOND_OPTFLAG_RAWVAL) return val; tbl = opt->values; if (!tbl) goto out; /* ULLONG_MAX is used to bypass string processing */ checkval = val->value != ULLONG_MAX; if (!checkval) { if (!val->string) goto out; p = strchr(val->string, '\n'); if (p) *p = '\0'; for (p = val->string; *p; p++) if (!(isdigit(*p) || isspace(*p))) break; /* The following code extracts the string to match or the value * and sets checkval appropriately */ if (*p) { rv = sscanf(val->string, "%32s", valstr); } else { rv = sscanf(val->string, "%llu", &val->value); checkval = true; } if (!rv) goto out; } for (i = 0; tbl[i].string; i++) { /* Check for exact match */ if (checkval) { if (val->value == tbl[i].value) ret = &tbl[i]; } else { if (!strcmp(valstr, "default") && (tbl[i].flags & BOND_VALFLAG_DEFAULT)) ret = &tbl[i]; if (!strcmp(valstr, tbl[i].string)) ret = &tbl[i]; } /* Found an exact match */ if (ret) goto out; } /* Possible range match */ if (checkval && bond_opt_check_range(opt, val->value)) ret = val; out: return ret; } /* Check opt's dependencies against bond mode and currently set options */ static int bond_opt_check_deps(struct bonding *bond, const struct bond_option *opt) { struct bond_params *params = &bond->params; if (test_bit(params->mode, &opt->unsuppmodes)) return -EACCES; if ((opt->flags & BOND_OPTFLAG_NOSLAVES) && bond_has_slaves(bond)) return -ENOTEMPTY; if ((opt->flags & BOND_OPTFLAG_IFDOWN) && (bond->dev->flags & IFF_UP)) return -EBUSY; return 0; } static void bond_opt_dep_print(struct bonding *bond, const struct bond_option *opt, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *modeval; struct bond_params *params; params = &bond->params; modeval = bond_opt_get_val(BOND_OPT_MODE, params->mode); if (test_bit(params->mode, &opt->unsuppmodes)) { netdev_err(bond->dev, "option %s: mode dependency failed, not supported in mode %s(%llu)\n", opt->name, modeval->string, modeval->value); NL_SET_ERR_MSG_ATTR(extack, bad_attr, "option not supported in mode"); } } static void bond_opt_error_interpret(struct bonding *bond, const struct bond_option *opt, int error, const struct bond_opt_value *val, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *minval, *maxval; char *p; switch (error) { case -EINVAL: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "invalid option value"); if (val) { if (val->string) { /* sometimes RAWVAL opts may have new lines */ p = strchr(val->string, '\n'); if (p) *p = '\0'; netdev_err(bond->dev, "option %s: invalid value (%s)\n", opt->name, val->string); } else { netdev_err(bond->dev, "option %s: invalid value (%llu)\n", opt->name, val->value); } } minval = bond_opt_get_flags(opt, BOND_VALFLAG_MIN); maxval = bond_opt_get_flags(opt, BOND_VALFLAG_MAX); if (!maxval) break; netdev_err(bond->dev, "option %s: allowed values %llu - %llu\n", opt->name, minval ? minval->value : 0, maxval->value); break; case -EACCES: bond_opt_dep_print(bond, opt, bad_attr, extack); break; case -ENOTEMPTY: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "unable to set option because the bond device has slaves"); netdev_err(bond->dev, "option %s: unable to set because the bond device has slaves\n", opt->name); break; case -EBUSY: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "unable to set option because the bond is up"); netdev_err(bond->dev, "option %s: unable to set because the bond device is up\n", opt->name); break; case -ENODEV: if (val && val->string) { p = strchr(val->string, '\n'); if (p) *p = '\0'; netdev_err(bond->dev, "option %s: interface %s does not exist!\n", opt->name, val->string); NL_SET_ERR_MSG_ATTR(extack, bad_attr, "interface does not exist"); } break; default: break; } } /** * __bond_opt_set - set a bonding option * @bond: target bond device * @option: option to set * @val: value to set it to * @bad_attr: netlink attribue that caused the error * @extack: extended netlink error structure, used when an error message * needs to be returned to the caller via netlink * * This function is used to change the bond's option value, it can be * used for both enabling/changing an option and for disabling it. RTNL lock * must be obtained before calling this function. */ int __bond_opt_set(struct bonding *bond, unsigned int option, struct bond_opt_value *val, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *retval = NULL; const struct bond_option *opt; int ret = -ENOENT; ASSERT_RTNL(); opt = bond_opt_get(option); if (WARN_ON(!val) || WARN_ON(!opt)) goto out; ret = bond_opt_check_deps(bond, opt); if (ret) goto out; retval = bond_opt_parse(opt, val); if (!retval) { ret = -EINVAL; goto out; } ret = opt->set(bond, retval); out: if (ret) bond_opt_error_interpret(bond, opt, ret, val, bad_attr, extack); return ret; } /** * __bond_opt_set_notify - set a bonding option * @bond: target bond device * @option: option to set * @val: value to set it to * * This function is used to change the bond's option value and trigger * a notification to user sapce. It can be used for both enabling/changing * an option and for disabling it. RTNL lock must be obtained before calling * this function. */ int __bond_opt_set_notify(struct bonding *bond, unsigned int option, struct bond_opt_value *val) { int ret; ASSERT_RTNL(); ret = __bond_opt_set(bond, option, val, NULL, NULL); if (!ret && (bond->dev->reg_state == NETREG_REGISTERED)) call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev); return ret; } /** * bond_opt_tryset_rtnl - try to acquire rtnl and call __bond_opt_set * @bond: target bond device * @option: option to set * @buf: value to set it to * * This function tries to acquire RTNL without blocking and if successful * calls __bond_opt_set. It is mainly used for sysfs option manipulation. */ int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf) { struct bond_opt_value optval; int ret; if (!rtnl_trylock()) return restart_syscall(); bond_opt_initstr(&optval, buf); ret = __bond_opt_set_notify(bond, option, &optval); rtnl_unlock(); return ret; } /** * bond_opt_get - get a pointer to an option * @option: option for which to return a pointer * * This function checks if option is valid and if so returns a pointer * to its entry in the bond_opts[] option array. */ const struct bond_option *bond_opt_get(unsigned int option) { if (!BOND_OPT_VALID(option)) return NULL; return &bond_opts[option]; } static bool bond_set_xfrm_features(struct bonding *bond) { if (!IS_ENABLED(CONFIG_XFRM_OFFLOAD)) return false; if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond->dev->wanted_features |= BOND_XFRM_FEATURES; else bond->dev->wanted_features &= ~BOND_XFRM_FEATURES; return true; } static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { if (bond->xdp_prog && !bond_xdp_check(bond, newval->value)) return -EOPNOTSUPP; if (!bond_mode_uses_arp(newval->value)) { if (bond->params.arp_interval) { netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", newval->string); /* disable arp monitoring */ bond->params.arp_interval = 0; } if (!bond->params.miimon) { /* set miimon to default value */ bond->params.miimon = BOND_DEFAULT_MIIMON; netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n", bond->params.miimon); } } if (newval->value == BOND_MODE_ALB) bond->params.tlb_dynamic_lb = 1; /* don't cache arp_validate between modes */ bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; bond->params.mode = newval->value; /* When changing mode, the bond device is down, we may reduce * the bond_bcast_neigh_enabled in bond_close() if broadcast_neighbor * enabled in 8023ad mode. Therefore, only clear broadcast_neighbor * to 0. */ bond->params.broadcast_neighbor = 0; if (bond->dev->reg_state == NETREG_REGISTERED) { bool update = false; update |= bond_set_xfrm_features(bond); if (update) netdev_update_features(bond->dev); } bond_xdp_set_features(bond->dev); return 0; } static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval) { char ifname[IFNAMSIZ] = { 0, }; struct net_device *slave_dev; int ret = 0; sscanf(newval->string, "%15s", ifname); /* IFNAMSIZ */ if (!strlen(ifname) || newval->string[0] == '\n') { slave_dev = NULL; } else { slave_dev = __dev_get_by_name(dev_net(bond->dev), ifname); if (!slave_dev) return -ENODEV; } if (slave_dev) { if (!netif_is_bond_slave(slave_dev)) { slave_err(bond->dev, slave_dev, "Device is not bonding slave\n"); return -EINVAL; } if (bond->dev != netdev_master_upper_dev_get(slave_dev)) { slave_err(bond->dev, slave_dev, "Device is not our slave\n"); return -EINVAL; } } block_netpoll_tx(); /* check to see if we are clearing active */ if (!slave_dev) { netdev_dbg(bond->dev, "Clearing current active slave\n"); bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); } else { struct slave *old_active = rtnl_dereference(bond->curr_active_slave); struct slave *new_active = bond_slave_get_rtnl(slave_dev); BUG_ON(!new_active); if (new_active == old_active) { /* do nothing */ slave_dbg(bond->dev, new_active->dev, "is already the current active slave\n"); } else { if (old_active && (new_active->link == BOND_LINK_UP) && bond_slave_is_up(new_active)) { slave_dbg(bond->dev, new_active->dev, "Setting as active slave\n"); bond_change_active_slave(bond, new_active); } else { slave_err(bond->dev, new_active->dev, "Could not set as active slave; either %s is down or the link is down\n", new_active->dev->name); ret = -EINVAL; } } } unblock_netpoll_tx(); return ret; } /* There are two tricky bits here. First, if MII monitoring is activated, then * we must disable ARP monitoring. Second, if the timer isn't running, we must * start it. */ static int bond_option_miimon_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting MII monitoring interval to %llu\n", newval->value); bond->params.miimon = newval->value; if (bond->params.updelay) netdev_dbg(bond->dev, "Note: Updating updelay (to %d) since it is a multiple of the miimon value\n", bond->params.updelay * bond->params.miimon); if (bond->params.downdelay) netdev_dbg(bond->dev, "Note: Updating downdelay (to %d) since it is a multiple of the miimon value\n", bond->params.downdelay * bond->params.miimon); if (bond->params.peer_notif_delay) netdev_dbg(bond->dev, "Note: Updating peer_notif_delay (to %d) since it is a multiple of the miimon value\n", bond->params.peer_notif_delay * bond->params.miimon); if (newval->value && bond->params.arp_interval) { netdev_dbg(bond->dev, "MII monitoring cannot be used with ARP monitoring - disabling ARP monitoring...\n"); bond->params.arp_interval = 0; if (bond->params.arp_validate) bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; } if (bond->dev->flags & IFF_UP) { /* If the interface is up, we may need to fire off * the MII timer. If the interface is down, the * timer will get fired off when the open function * is called. */ if (!newval->value) { cancel_delayed_work_sync(&bond->mii_work); } else { cancel_delayed_work_sync(&bond->arp_work); queue_delayed_work(bond->wq, &bond->mii_work, 0); } } return 0; } /* Set up, down and peer notification delays. These must be multiples * of the MII monitoring value, and are stored internally as the * multiplier. Thus, we must translate to MS for the real world. */ static int _bond_option_delay_set(struct bonding *bond, const struct bond_opt_value *newval, const char *name, int *target) { int value = newval->value; if (!bond->params.miimon) { netdev_err(bond->dev, "Unable to set %s as MII monitoring is disabled\n", name); return -EPERM; } if ((value % bond->params.miimon) != 0) { netdev_warn(bond->dev, "%s (%d) is not a multiple of miimon (%d), value rounded to %d ms\n", name, value, bond->params.miimon, (value / bond->params.miimon) * bond->params.miimon); } *target = value / bond->params.miimon; netdev_dbg(bond->dev, "Setting %s to %d\n", name, *target * bond->params.miimon); return 0; } static int bond_option_updelay_set(struct bonding *bond, const struct bond_opt_value *newval) { return _bond_option_delay_set(bond, newval, "up delay", &bond->params.updelay); } static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval) { return _bond_option_delay_set(bond, newval, "down delay", &bond->params.downdelay); } static int bond_option_peer_notif_delay_set(struct bonding *bond, const struct bond_opt_value *newval) { int ret = _bond_option_delay_set(bond, newval, "peer notification delay", &bond->params.peer_notif_delay); return ret; } static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval) { return 0; } /* There are two tricky bits here. First, if ARP monitoring is activated, then * we must disable MII monitoring. Second, if the ARP timer isn't running, * we must start it. */ static int bond_option_arp_interval_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ARP monitoring interval to %llu\n", newval->value); bond->params.arp_interval = newval->value; if (newval->value) { if (bond->params.miimon) { netdev_dbg(bond->dev, "ARP monitoring cannot be used with MII monitoring. Disabling MII monitoring\n"); bond->params.miimon = 0; } if (!bond->params.arp_targets[0]) netdev_dbg(bond->dev, "ARP monitoring has been set up, but no ARP targets have been specified\n"); } if (bond->dev->flags & IFF_UP) { /* If the interface is up, we may need to fire off * the ARP timer. If the interface is down, the * timer will get fired off when the open function * is called. */ if (!newval->value) { if (bond->params.arp_validate) bond->recv_probe = NULL; cancel_delayed_work_sync(&bond->arp_work); } else { /* arp_validate can be set only in active-backup mode */ bond->recv_probe = bond_rcv_validate; cancel_delayed_work_sync(&bond->mii_work); queue_delayed_work(bond->wq, &bond->arp_work, 0); } } return 0; } static void _bond_options_arp_ip_target_set(struct bonding *bond, int slot, __be32 target, unsigned long last_rx) { __be32 *targets = bond->params.arp_targets; struct list_head *iter; struct slave *slave; if (slot >= 0 && slot < BOND_MAX_ARP_TARGETS) { bond_for_each_slave(bond, slave, iter) WRITE_ONCE(slave->target_last_arp_rx[slot], last_rx); targets[slot] = target; } } static int _bond_option_arp_ip_target_add(struct bonding *bond, __be32 target) { __be32 *targets = bond->params.arp_targets; int ind; if (!bond_is_ip_target_ok(target)) { netdev_err(bond->dev, "invalid ARP target %pI4 specified for addition\n", &target); return -EINVAL; } if (bond_get_targets_ip(targets, target) != -1) { /* dup */ netdev_err(bond->dev, "ARP target %pI4 is already present\n", &target); return -EINVAL; } ind = bond_get_targets_ip(targets, 0); /* first free slot */ if (ind == -1) { netdev_err(bond->dev, "ARP target table is full!\n"); return -EINVAL; } netdev_dbg(bond->dev, "Adding ARP target %pI4\n", &target); _bond_options_arp_ip_target_set(bond, ind, target, jiffies); return 0; } static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target) { return _bond_option_arp_ip_target_add(bond, target); } static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target) { __be32 *targets = bond->params.arp_targets; struct list_head *iter; struct slave *slave; unsigned long *targets_rx; int ind, i; if (!bond_is_ip_target_ok(target)) { netdev_err(bond->dev, "invalid ARP target %pI4 specified for removal\n", &target); return -EINVAL; } ind = bond_get_targets_ip(targets, target); if (ind == -1) { netdev_err(bond->dev, "unable to remove nonexistent ARP target %pI4\n", &target); return -EINVAL; } if (ind == 0 && !targets[1] && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last arp target with arp_interval on\n"); netdev_dbg(bond->dev, "Removing ARP target %pI4\n", &target); bond_for_each_slave(bond, slave, iter) { targets_rx = slave->target_last_arp_rx; for (i = ind; (i < BOND_MAX_ARP_TARGETS-1) && targets[i+1]; i++) WRITE_ONCE(targets_rx[i], READ_ONCE(targets_rx[i+1])); WRITE_ONCE(targets_rx[i], 0); } for (i = ind; (i < BOND_MAX_ARP_TARGETS-1) && targets[i+1]; i++) targets[i] = targets[i+1]; targets[i] = 0; return 0; } void bond_option_arp_ip_targets_clear(struct bonding *bond) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) _bond_options_arp_ip_target_set(bond, i, 0, 0); } static int bond_option_arp_ip_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { int ret = -EPERM; __be32 target; if (newval->string) { if (strlen(newval->string) < 1 || !in4_pton(newval->string + 1, -1, (u8 *)&target, -1, NULL)) { netdev_err(bond->dev, "invalid ARP target specified\n"); return ret; } if (newval->string[0] == '+') ret = bond_option_arp_ip_target_add(bond, target); else if (newval->string[0] == '-') ret = bond_option_arp_ip_target_rem(bond, target); else netdev_err(bond->dev, "no command found in arp_ip_targets file - use +<addr> or -<addr>\n"); } else { target = newval->value; ret = bond_option_arp_ip_target_add(bond, target); } return ret; } #if IS_ENABLED(CONFIG_IPV6) static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && !bond_is_active_slave(slave) && slave->dev->flags & IFF_MULTICAST; } /** * slave_set_ns_maddrs - add/del all NS mac addresses for slave * @bond: bond device * @slave: slave device * @add: add or remove all the NS mac addresses * * This function tries to add or delete all the NS mac addresses on the slave * * Note, the IPv6 NS target address is the unicast address in Neighbor * Solicitation (NS) message. The dest address of NS message should be * solicited-node multicast address of the target. The dest mac of NS message * is converted from the solicited-node multicast address. * * This function is called when * * arp_validate changes * * enslaving, releasing new slaves */ static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) { struct in6_addr *targets = bond->params.ns_targets; char slot_maddr[MAX_ADDR_LEN]; struct in6_addr mcaddr; int i; if (!slave_can_set_ns_maddr(bond, slave)) return; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { if (ipv6_addr_any(&targets[i])) break; addrconf_addr_solict_mult(&targets[i], &mcaddr); if (!ndisc_mc_map(&mcaddr, slot_maddr, slave->dev, 0)) { if (add) dev_mc_add(slave->dev, slot_maddr); else dev_mc_del(slave->dev, slot_maddr); } } } void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) { if (!bond->params.arp_validate) return; slave_set_ns_maddrs(bond, slave, true); } void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) { if (!bond->params.arp_validate) return; slave_set_ns_maddrs(bond, slave, false); } /** * slave_set_ns_maddr - set new NS mac address for slave * @bond: bond device * @slave: slave device * @target: the new IPv6 target * @slot: the old IPv6 target in the slot * * This function tries to replace the old mac address to new one on the slave. * * Note, the target/slot IPv6 address is the unicast address in Neighbor * Solicitation (NS) message. The dest address of NS message should be * solicited-node multicast address of the target. The dest mac of NS message * is converted from the solicited-node multicast address. * * This function is called when * * An IPv6 NS target is added or removed. */ static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, struct in6_addr *target, struct in6_addr *slot) { char mac_addr[MAX_ADDR_LEN]; struct in6_addr mcast_addr; if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) return; /* remove the previous mac addr from slave */ addrconf_addr_solict_mult(slot, &mcast_addr); if (!ipv6_addr_any(slot) && !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) dev_mc_del(slave->dev, mac_addr); /* add new mac addr on slave if target is set */ addrconf_addr_solict_mult(target, &mcast_addr); if (!ipv6_addr_any(target) && !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) dev_mc_add(slave->dev, mac_addr); } static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct in6_addr *target, unsigned long last_rx) { struct in6_addr *targets = bond->params.ns_targets; struct list_head *iter; struct slave *slave; if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { bond_for_each_slave(bond, slave, iter) { WRITE_ONCE(slave->target_last_arp_rx[slot], last_rx); slave_set_ns_maddr(bond, slave, target, &targets[slot]); } targets[slot] = *target; } } void bond_option_ns_ip6_targets_clear(struct bonding *bond) { struct in6_addr addr_any = in6addr_any; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) _bond_options_ns_ip6_target_set(bond, i, &addr_any, 0); } static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { struct in6_addr *target = (struct in6_addr *)newval->extra; struct in6_addr *targets = bond->params.ns_targets; struct in6_addr addr_any = in6addr_any; int index; if (!bond_is_ip6_target_ok(target)) { netdev_err(bond->dev, "invalid NS target %pI6c specified for addition\n", target); return -EINVAL; } if (bond_get_targets_ip6(targets, target) != -1) { /* dup */ netdev_err(bond->dev, "NS target %pI6c is already present\n", target); return -EINVAL; } index = bond_get_targets_ip6(targets, &addr_any); /* first free slot */ if (index == -1) { netdev_err(bond->dev, "NS target table is full!\n"); return -EINVAL; } netdev_dbg(bond->dev, "Adding NS target %pI6c\n", target); _bond_options_ns_ip6_target_set(bond, index, target, jiffies); return 0; } #else static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { return -EPERM; } static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {} void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {} void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {} #endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { bool changed = !!bond->params.arp_validate != !!newval->value; struct list_head *iter; struct slave *slave; netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n", newval->string, newval->value); bond->params.arp_validate = newval->value; if (changed) { bond_for_each_slave(bond, slave, iter) slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate); } return 0; } static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting arp_all_targets to %s (%llu)\n", newval->string, newval->value); bond->params.arp_all_targets = newval->value; return 0; } static int bond_option_missed_max_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting missed max to %s (%llu)\n", newval->string, newval->value); bond->params.missed_max = newval->value; return 0; } static int bond_option_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave; slave = bond_slave_get_rtnl(newval->slave_dev); if (!slave) { netdev_dbg(newval->slave_dev, "%s called on NULL slave\n", __func__); return -ENODEV; } slave->prio = newval->value; if (rtnl_dereference(bond->primary_slave)) slave_warn(bond->dev, slave->dev, "prio updated, but will not affect failover re-selection as primary slave have been set\n"); else bond_select_active_slave(bond); return 0; } static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval) { char *p, *primary = newval->string; struct list_head *iter; struct slave *slave; block_netpoll_tx(); p = strchr(primary, '\n'); if (p) *p = '\0'; /* check to see if we are clearing primary */ if (!strlen(primary)) { netdev_dbg(bond->dev, "Setting primary slave to None\n"); RCU_INIT_POINTER(bond->primary_slave, NULL); memset(bond->params.primary, 0, sizeof(bond->params.primary)); bond_select_active_slave(bond); goto out; } bond_for_each_slave(bond, slave, iter) { if (strncmp(slave->dev->name, primary, IFNAMSIZ) == 0) { slave_dbg(bond->dev, slave->dev, "Setting as primary slave\n"); rcu_assign_pointer(bond->primary_slave, slave); strcpy(bond->params.primary, slave->dev->name); bond->force_primary = true; bond_select_active_slave(bond); goto out; } } if (rtnl_dereference(bond->primary_slave)) { netdev_dbg(bond->dev, "Setting primary slave to None\n"); RCU_INIT_POINTER(bond->primary_slave, NULL); bond_select_active_slave(bond); } strscpy_pad(bond->params.primary, primary, IFNAMSIZ); netdev_dbg(bond->dev, "Recording %s as primary, but it has not been enslaved yet\n", primary); out: unblock_netpoll_tx(); return 0; } static int bond_option_primary_reselect_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting primary_reselect to %s (%llu)\n", newval->string, newval->value); bond->params.primary_reselect = newval->value; block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); return 0; } static int bond_option_fail_over_mac_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting fail_over_mac to %s (%llu)\n", newval->string, newval->value); bond->params.fail_over_mac = newval->value; return 0; } static int bond_option_xmit_hash_policy_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n", newval->string, newval->value); bond->params.xmit_policy = newval->value; return 0; } static int bond_option_resend_igmp_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting resend_igmp to %llu\n", newval->value); bond->params.resend_igmp = newval->value; return 0; } static int bond_option_num_peer_notif_set(struct bonding *bond, const struct bond_opt_value *newval) { bond->params.num_peer_notif = newval->value; return 0; } static int bond_option_all_slaves_active_set(struct bonding *bond, const struct bond_opt_value *newval) { struct list_head *iter; struct slave *slave; if (newval->value == bond->params.all_slaves_active) return 0; bond->params.all_slaves_active = newval->value; bond_for_each_slave(bond, slave, iter) { if (!bond_is_active_slave(slave)) { if (newval->value) slave->inactive = 0; else slave->inactive = 1; } } return 0; } static int bond_option_min_links_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting min links value to %llu\n", newval->value); bond->params.min_links = newval->value; bond_set_carrier(bond); return 0; } static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval) { bond->params.lp_interval = newval->value; return 0; } static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting packets per slave to %llu\n", newval->value); bond->params.packets_per_slave = newval->value; if (newval->value > 0) { bond->params.reciprocal_packets_per_slave = reciprocal_value(newval->value); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ bond->params.reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } return 0; } static int bond_option_lacp_active_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; bond_3ad_update_lacp_active(bond); return 0; } static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting LACP rate to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_fast = newval->value; bond_3ad_update_lacp_rate(bond); return 0; } static int bond_option_ad_select_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_select to %s (%llu)\n", newval->string, newval->value); bond->params.ad_select = newval->value; return 0; } static int bond_option_queue_id_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave, *update_slave; struct net_device *sdev; struct list_head *iter; char *delim; int ret = 0; u16 qid; /* delim will point to queue id if successful */ delim = strchr(newval->string, ':'); if (!delim) goto err_no_cmd; /* Terminate string that points to device name and bump it * up one, so we can read the queue id there. */ *delim = '\0'; if (sscanf(++delim, "%hd\n", &qid) != 1) goto err_no_cmd; /* Check buffer length, valid ifname and queue id */ if (!dev_valid_name(newval->string) || qid > bond->dev->real_num_tx_queues) goto err_no_cmd; /* Get the pointer to that interface if it exists */ sdev = __dev_get_by_name(dev_net(bond->dev), newval->string); if (!sdev) goto err_no_cmd; /* Search for thes slave and check for duplicate qids */ update_slave = NULL; bond_for_each_slave(bond, slave, iter) { if (sdev == slave->dev) /* We don't need to check the matching * slave for dups, since we're overwriting it */ update_slave = slave; else if (qid && qid == slave->queue_id) { goto err_no_cmd; } } if (!update_slave) goto err_no_cmd; /* Actually set the qids for the slave */ WRITE_ONCE(update_slave->queue_id, qid); out: return ret; err_no_cmd: netdev_dbg(bond->dev, "invalid input for queue_id set\n"); ret = -EPERM; goto out; } static int bond_option_slaves_set(struct bonding *bond, const struct bond_opt_value *newval) { char command[IFNAMSIZ + 1] = { 0, }; struct net_device *dev; char *ifname; int ret; sscanf(newval->string, "%16s", command); /* IFNAMSIZ*/ ifname = command + 1; if ((strlen(command) <= 1) || (command[0] != '+' && command[0] != '-') || !dev_valid_name(ifname)) goto err_no_cmd; dev = __dev_get_by_name(dev_net(bond->dev), ifname); if (!dev) { netdev_dbg(bond->dev, "interface %s does not exist!\n", ifname); ret = -ENODEV; goto out; } switch (command[0]) { case '+': slave_dbg(bond->dev, dev, "Enslaving interface\n"); ret = bond_enslave(bond->dev, dev, NULL); break; case '-': slave_dbg(bond->dev, dev, "Releasing interface\n"); ret = bond_release(bond->dev, dev); break; default: /* should not run here. */ goto err_no_cmd; } out: return ret; err_no_cmd: netdev_err(bond->dev, "no command found in slaves file - use +ifname or -ifname\n"); ret = -EPERM; goto out; } static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting dynamic-lb to %s (%llu)\n", newval->string, newval->value); bond->params.tlb_dynamic_lb = newval->value; return 0; } static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_actor_sys_prio to %llu\n", newval->value); bond->params.ad_actor_sys_prio = newval->value; bond_3ad_update_ad_actor_settings(bond); return 0; } static int bond_option_actor_port_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave; slave = bond_slave_get_rtnl(newval->slave_dev); if (!slave) { netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); return -ENODEV; } netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", newval->value); SLAVE_AD_INFO(slave)->port_priority = newval->value; bond_3ad_update_ad_actor_settings(bond); return 0; } static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { u8 macaddr[ETH_ALEN]; u8 *mac; if (newval->string) { if (!mac_pton(newval->string, macaddr)) goto err; mac = macaddr; } else { mac = (u8 *)&newval->value; } if (is_multicast_ether_addr(mac)) goto err; netdev_dbg(bond->dev, "Setting ad_actor_system to %pM\n", mac); ether_addr_copy(bond->params.ad_actor_system, mac); bond_3ad_update_ad_actor_settings(bond); return 0; err: netdev_err(bond->dev, "Invalid ad_actor_system MAC address.\n"); return -EINVAL; } static int bond_option_ad_user_port_key_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_user_port_key to %llu\n", newval->value); bond->params.ad_user_port_key = newval->value; return 0; } static int bond_option_coupled_control_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_info(bond->dev, "Setting coupled_control to %s (%llu)\n", newval->string, newval->value); bond->params.coupled_control = newval->value; return 0; } static int bond_option_broadcast_neigh_set(struct bonding *bond, const struct bond_opt_value *newval) { if (bond->params.broadcast_neighbor == newval->value) return 0; bond->params.broadcast_neighbor = newval->value; if (bond->dev->flags & IFF_UP) { if (bond->params.broadcast_neighbor) static_branch_inc(&bond_bcast_neigh_enabled); else static_branch_dec(&bond_bcast_neigh_enabled); } netdev_dbg(bond->dev, "Setting broadcast_neighbor to %s (%llu)\n", newval->string, newval->value); return 0; } |
| 13 13 8 5 80 80 93 5 101 5 5 5 5 101 13 7 7 13 13 13 13 6 13 13 5 13 13 13 13 13 24 24 1 1 1 1 1 3 5 8 7 1 6 8 1 102 1 1 7 10 9 1 8 8 33 101 101 101 101 100 101 101 101 3 94 94 94 94 94 94 93 94 80 79 80 80 80 80 80 80 80 80 80 28 29 29 29 33 33 33 14 14 14 14 14 14 13 13 14 1 1 1 11 11 11 11 3 3 3 2 3 3 3 7 5 5 4 3 2 2 1 1 3 7 6 6 6 6 6 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2006 - 2007 Ivo van Doorn * Copyright (C) 2007 Dmitry Torokhov * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/rfkill.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/fs.h> #include <linux/slab.h> #include "rfkill.h" #define POLL_INTERVAL (5 * HZ) #define RFKILL_BLOCK_HW BIT(0) #define RFKILL_BLOCK_SW BIT(1) #define RFKILL_BLOCK_SW_PREV BIT(2) #define RFKILL_BLOCK_ANY (RFKILL_BLOCK_HW |\ RFKILL_BLOCK_SW |\ RFKILL_BLOCK_SW_PREV) #define RFKILL_BLOCK_SW_SETCALL BIT(31) struct rfkill { spinlock_t lock; enum rfkill_type type; unsigned long state; unsigned long hard_block_reasons; u32 idx; bool registered; bool persistent; bool polling_paused; bool suspended; bool need_sync; const struct rfkill_ops *ops; void *data; #ifdef CONFIG_RFKILL_LEDS struct led_trigger led_trigger; const char *ledtrigname; #endif struct device dev; struct list_head node; struct delayed_work poll_work; struct work_struct uevent_work; struct work_struct sync_work; char name[]; }; #define to_rfkill(d) container_of(d, struct rfkill, dev) struct rfkill_int_event { struct list_head list; struct rfkill_event_ext ev; }; struct rfkill_data { struct list_head list; struct list_head events; struct mutex mtx; wait_queue_head_t read_wait; bool input_handler; u8 max_size; }; MODULE_AUTHOR("Ivo van Doorn <IvDoorn@gmail.com>"); MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>"); MODULE_DESCRIPTION("RF switch support"); MODULE_LICENSE("GPL"); /* * The locking here should be made much smarter, we currently have * a bit of a stupid situation because drivers might want to register * the rfkill struct under their own lock, and take this lock during * rfkill method calls -- which will cause an AB-BA deadlock situation. * * To fix that, we need to rework this code here to be mostly lock-free * and only use the mutex for list manipulations, not to protect the * various other global variables. Then we can avoid holding the mutex * around driver operations, and all is happy. */ static LIST_HEAD(rfkill_list); /* list of registered rf switches */ static DEFINE_MUTEX(rfkill_global_mutex); static LIST_HEAD(rfkill_fds); /* list of open fds of /dev/rfkill */ static unsigned int rfkill_default_state = 1; module_param_named(default_state, rfkill_default_state, uint, 0444); MODULE_PARM_DESC(default_state, "Default initial state for all radio types, 0 = radio off"); static struct { bool cur, sav; } rfkill_global_states[NUM_RFKILL_TYPES]; static bool rfkill_epo_lock_active; #ifdef CONFIG_RFKILL_LEDS static void rfkill_led_trigger_event(struct rfkill *rfkill) { struct led_trigger *trigger; if (!rfkill->registered) return; trigger = &rfkill->led_trigger; if (rfkill->state & RFKILL_BLOCK_ANY) led_trigger_event(trigger, LED_OFF); else led_trigger_event(trigger, LED_FULL); } static int rfkill_led_trigger_activate(struct led_classdev *led) { struct rfkill *rfkill; rfkill = container_of(led->trigger, struct rfkill, led_trigger); rfkill_led_trigger_event(rfkill); return 0; } const char *rfkill_get_led_trigger_name(struct rfkill *rfkill) { return rfkill->led_trigger.name; } EXPORT_SYMBOL(rfkill_get_led_trigger_name); void rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name) { BUG_ON(!rfkill); rfkill->ledtrigname = name; } EXPORT_SYMBOL(rfkill_set_led_trigger_name); static int rfkill_led_trigger_register(struct rfkill *rfkill) { rfkill->led_trigger.name = rfkill->ledtrigname ? : dev_name(&rfkill->dev); rfkill->led_trigger.activate = rfkill_led_trigger_activate; return led_trigger_register(&rfkill->led_trigger); } static void rfkill_led_trigger_unregister(struct rfkill *rfkill) { led_trigger_unregister(&rfkill->led_trigger); } static struct led_trigger rfkill_any_led_trigger; static struct led_trigger rfkill_none_led_trigger; static struct work_struct rfkill_global_led_trigger_work; static void rfkill_global_led_trigger_worker(struct work_struct *work) { enum led_brightness brightness = LED_OFF; struct rfkill *rfkill; mutex_lock(&rfkill_global_mutex); list_for_each_entry(rfkill, &rfkill_list, node) { if (!(rfkill->state & RFKILL_BLOCK_ANY)) { brightness = LED_FULL; break; } } mutex_unlock(&rfkill_global_mutex); led_trigger_event(&rfkill_any_led_trigger, brightness); led_trigger_event(&rfkill_none_led_trigger, brightness == LED_OFF ? LED_FULL : LED_OFF); } static void rfkill_global_led_trigger_event(void) { schedule_work(&rfkill_global_led_trigger_work); } static int rfkill_global_led_trigger_register(void) { int ret; INIT_WORK(&rfkill_global_led_trigger_work, rfkill_global_led_trigger_worker); rfkill_any_led_trigger.name = "rfkill-any"; ret = led_trigger_register(&rfkill_any_led_trigger); if (ret) return ret; rfkill_none_led_trigger.name = "rfkill-none"; ret = led_trigger_register(&rfkill_none_led_trigger); if (ret) led_trigger_unregister(&rfkill_any_led_trigger); else /* Delay activation until all global triggers are registered */ rfkill_global_led_trigger_event(); return ret; } static void rfkill_global_led_trigger_unregister(void) { led_trigger_unregister(&rfkill_none_led_trigger); led_trigger_unregister(&rfkill_any_led_trigger); cancel_work_sync(&rfkill_global_led_trigger_work); } #else static void rfkill_led_trigger_event(struct rfkill *rfkill) { } static inline int rfkill_led_trigger_register(struct rfkill *rfkill) { return 0; } static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) { } static void rfkill_global_led_trigger_event(void) { } static int rfkill_global_led_trigger_register(void) { return 0; } static void rfkill_global_led_trigger_unregister(void) { } #endif /* CONFIG_RFKILL_LEDS */ static void rfkill_fill_event(struct rfkill_event_ext *ev, struct rfkill *rfkill, enum rfkill_operation op) { unsigned long flags; ev->idx = rfkill->idx; ev->type = rfkill->type; ev->op = op; spin_lock_irqsave(&rfkill->lock, flags); ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | RFKILL_BLOCK_SW_PREV)); ev->hard_block_reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); } static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) { struct rfkill_data *data; struct rfkill_int_event *ev; list_for_each_entry(data, &rfkill_fds, list) { ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) continue; rfkill_fill_event(&ev->ev, rfkill, op); mutex_lock(&data->mtx); list_add_tail(&ev->list, &data->events); mutex_unlock(&data->mtx); wake_up_interruptible(&data->read_wait); } } static void rfkill_event(struct rfkill *rfkill) { if (!rfkill->registered) return; kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); /* also send event to /dev/rfkill */ rfkill_send_events(rfkill, RFKILL_OP_CHANGE); } /** * rfkill_set_block - wrapper for set_block method * * @rfkill: the rfkill struct to use * @blocked: the new software state * * Calls the set_block method (when applicable) and handles notifications * etc. as well. */ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) { unsigned long flags; bool prev, curr; int err; if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP)) return; /* * Some platforms (...!) generate input events which affect the * _hard_ kill state -- whenever something tries to change the * current software state query the hardware state too. */ if (rfkill->ops->query) rfkill->ops->query(rfkill, rfkill->data); spin_lock_irqsave(&rfkill->lock, flags); prev = rfkill->state & RFKILL_BLOCK_SW; if (prev) rfkill->state |= RFKILL_BLOCK_SW_PREV; else rfkill->state &= ~RFKILL_BLOCK_SW_PREV; if (blocked) rfkill->state |= RFKILL_BLOCK_SW; else rfkill->state &= ~RFKILL_BLOCK_SW; rfkill->state |= RFKILL_BLOCK_SW_SETCALL; spin_unlock_irqrestore(&rfkill->lock, flags); err = rfkill->ops->set_block(rfkill->data, blocked); spin_lock_irqsave(&rfkill->lock, flags); if (err) { /* * Failed -- reset status to _PREV, which may be different * from what we have set _PREV to earlier in this function * if rfkill_set_sw_state was invoked. */ if (rfkill->state & RFKILL_BLOCK_SW_PREV) rfkill->state |= RFKILL_BLOCK_SW; else rfkill->state &= ~RFKILL_BLOCK_SW; } rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL; rfkill->state &= ~RFKILL_BLOCK_SW_PREV; curr = rfkill->state & RFKILL_BLOCK_SW; spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); if (prev != curr) rfkill_event(rfkill); } static void rfkill_sync(struct rfkill *rfkill) { lockdep_assert_held(&rfkill_global_mutex); if (!rfkill->need_sync) return; rfkill_set_block(rfkill, rfkill_global_states[rfkill->type].cur); rfkill->need_sync = false; } static void rfkill_update_global_state(enum rfkill_type type, bool blocked) { int i; if (type != RFKILL_TYPE_ALL) { rfkill_global_states[type].cur = blocked; return; } for (i = 0; i < NUM_RFKILL_TYPES; i++) rfkill_global_states[i].cur = blocked; } #ifdef CONFIG_RFKILL_INPUT static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); /** * __rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected * @blocked: the new state * * This function sets the state of all switches of given type, * unless a specific switch is suspended. * * Caller must have acquired rfkill_global_mutex. */ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; rfkill_update_global_state(type, blocked); list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; rfkill_set_block(rfkill, blocked); } } /** * rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected * @blocked: the new state * * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state). * Please refer to __rfkill_switch_all() for details. * * Does nothing if the EPO lock is active. */ void rfkill_switch_all(enum rfkill_type type, bool blocked) { if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); if (!rfkill_epo_lock_active) __rfkill_switch_all(type, blocked); mutex_unlock(&rfkill_global_mutex); } /** * rfkill_epo - emergency power off all transmitters * * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED, * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex. * * The global state before the EPO is saved and can be restored later * using rfkill_restore_states(). */ void rfkill_epo(void) { struct rfkill *rfkill; int i; if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = true; list_for_each_entry(rfkill, &rfkill_list, node) rfkill_set_block(rfkill, true); for (i = 0; i < NUM_RFKILL_TYPES; i++) { rfkill_global_states[i].sav = rfkill_global_states[i].cur; rfkill_global_states[i].cur = true; } mutex_unlock(&rfkill_global_mutex); } /** * rfkill_restore_states - restore global states * * Restore (and sync switches to) the global state from the * states in rfkill_default_states. This can undo the effects of * a call to rfkill_epo(). */ void rfkill_restore_states(void) { int i; if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; for (i = 0; i < NUM_RFKILL_TYPES; i++) __rfkill_switch_all(i, rfkill_global_states[i].sav); mutex_unlock(&rfkill_global_mutex); } /** * rfkill_remove_epo_lock - unlock state changes * * Used by rfkill-input manually unlock state changes, when * the EPO switch is deactivated. */ void rfkill_remove_epo_lock(void) { if (atomic_read(&rfkill_input_disabled)) return; mutex_lock(&rfkill_global_mutex); rfkill_epo_lock_active = false; mutex_unlock(&rfkill_global_mutex); } /** * rfkill_is_epo_lock_active - returns true EPO is active * * Returns 0 (false) if there is NOT an active EPO condition, * and 1 (true) if there is an active EPO condition, which * locks all radios in one of the BLOCKED states. * * Can be called in atomic context. */ bool rfkill_is_epo_lock_active(void) { return rfkill_epo_lock_active; } /** * rfkill_get_global_sw_state - returns global state for a type * @type: the type to get the global state of * * Returns the current global state for a given wireless * device type. */ bool rfkill_get_global_sw_state(const enum rfkill_type type) { return rfkill_global_states[type].cur; } #endif bool rfkill_set_hw_state_reason(struct rfkill *rfkill, bool blocked, enum rfkill_hard_block_reasons reason) { unsigned long flags; bool ret, prev; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); prev = !!(rfkill->hard_block_reasons & reason); if (blocked) { rfkill->state |= RFKILL_BLOCK_HW; rfkill->hard_block_reasons |= reason; } else { rfkill->hard_block_reasons &= ~reason; if (!rfkill->hard_block_reasons) rfkill->state &= ~RFKILL_BLOCK_HW; } ret = !!(rfkill->state & RFKILL_BLOCK_ANY); spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); if (rfkill->registered && prev != blocked) schedule_work(&rfkill->uevent_work); return ret; } EXPORT_SYMBOL(rfkill_set_hw_state_reason); static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { u32 bit = RFKILL_BLOCK_SW; /* if in a ops->set_block right now, use other bit */ if (rfkill->state & RFKILL_BLOCK_SW_SETCALL) bit = RFKILL_BLOCK_SW_PREV; if (blocked) rfkill->state |= bit; else rfkill->state &= ~bit; } bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) { unsigned long flags; bool prev, hwblock; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); prev = !!(rfkill->state & RFKILL_BLOCK_SW); __rfkill_set_sw_state(rfkill, blocked); hwblock = !!(rfkill->state & RFKILL_BLOCK_HW); blocked = blocked || hwblock; spin_unlock_irqrestore(&rfkill->lock, flags); if (!rfkill->registered) return blocked; if (prev != blocked && !hwblock) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); return blocked; } EXPORT_SYMBOL(rfkill_set_sw_state); void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked) { unsigned long flags; BUG_ON(!rfkill); BUG_ON(rfkill->registered); spin_lock_irqsave(&rfkill->lock, flags); __rfkill_set_sw_state(rfkill, blocked); rfkill->persistent = true; spin_unlock_irqrestore(&rfkill->lock, flags); } EXPORT_SYMBOL(rfkill_init_sw_state); void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) { unsigned long flags; bool swprev, hwprev; BUG_ON(!rfkill); spin_lock_irqsave(&rfkill->lock, flags); /* * No need to care about prev/setblock ... this is for uevent only * and that will get triggered by rfkill_set_block anyway. */ swprev = !!(rfkill->state & RFKILL_BLOCK_SW); hwprev = !!(rfkill->state & RFKILL_BLOCK_HW); __rfkill_set_sw_state(rfkill, sw); if (hw) rfkill->state |= RFKILL_BLOCK_HW; else rfkill->state &= ~RFKILL_BLOCK_HW; spin_unlock_irqrestore(&rfkill->lock, flags); if (!rfkill->registered) { rfkill->persistent = true; } else { if (swprev != sw || hwprev != hw) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); rfkill_global_led_trigger_event(); } } EXPORT_SYMBOL(rfkill_set_states); static const char * const rfkill_types[] = { NULL, /* RFKILL_TYPE_ALL */ "wlan", "bluetooth", "ultrawideband", "wimax", "wwan", "gps", "fm", "nfc", }; enum rfkill_type rfkill_find_type(const char *name) { int i; BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES); if (!name) return RFKILL_TYPE_ALL; for (i = 1; i < NUM_RFKILL_TYPES; i++) if (!strcmp(name, rfkill_types[i])) return i; return RFKILL_TYPE_ALL; } EXPORT_SYMBOL(rfkill_find_type); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%s\n", rfkill->name); } static DEVICE_ATTR_RO(name); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%s\n", rfkill_types[rfkill->type]); } static DEVICE_ATTR_RO(type); static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", rfkill->idx); } static DEVICE_ATTR_RO(index); static ssize_t persistent_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", rfkill->persistent); } static DEVICE_ATTR_RO(persistent); static ssize_t hard_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0); } static DEVICE_ATTR_RO(hard); static ssize_t soft_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 1 : 0); } static ssize_t soft_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct rfkill *rfkill = to_rfkill(dev); unsigned long state; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &state); if (err) return err; if (state > 1 ) return -EINVAL; mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); rfkill_set_block(rfkill, state); mutex_unlock(&rfkill_global_mutex); return count; } static DEVICE_ATTR_RW(soft); static ssize_t hard_block_reasons_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); return sysfs_emit(buf, "0x%lx\n", rfkill->hard_block_reasons); } static DEVICE_ATTR_RO(hard_block_reasons); static u8 user_state_from_blocked(unsigned long state) { if (state & RFKILL_BLOCK_HW) return RFKILL_USER_STATE_HARD_BLOCKED; if (state & RFKILL_BLOCK_SW) return RFKILL_USER_STATE_SOFT_BLOCKED; return RFKILL_USER_STATE_UNBLOCKED; } static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rfkill *rfkill = to_rfkill(dev); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); return sysfs_emit(buf, "%d\n", user_state_from_blocked(rfkill->state)); } static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct rfkill *rfkill = to_rfkill(dev); unsigned long state; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &state); if (err) return err; if (state != RFKILL_USER_STATE_SOFT_BLOCKED && state != RFKILL_USER_STATE_UNBLOCKED) return -EINVAL; mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED); mutex_unlock(&rfkill_global_mutex); return count; } static DEVICE_ATTR_RW(state); static struct attribute *rfkill_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_type.attr, &dev_attr_index.attr, &dev_attr_persistent.attr, &dev_attr_state.attr, &dev_attr_soft.attr, &dev_attr_hard.attr, &dev_attr_hard_block_reasons.attr, NULL, }; ATTRIBUTE_GROUPS(rfkill_dev); static void rfkill_release(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); kfree(rfkill); } static int rfkill_dev_uevent(const struct device *dev, struct kobj_uevent_env *env) { struct rfkill *rfkill = to_rfkill(dev); unsigned long flags; unsigned long reasons; u32 state; int error; error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name); if (error) return error; error = add_uevent_var(env, "RFKILL_TYPE=%s", rfkill_types[rfkill->type]); if (error) return error; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); error = add_uevent_var(env, "RFKILL_STATE=%d", user_state_from_blocked(state)); if (error) return error; return add_uevent_var(env, "RFKILL_HW_BLOCK_REASON=0x%lx", reasons); } void rfkill_pause_polling(struct rfkill *rfkill) { BUG_ON(!rfkill); if (!rfkill->ops->poll) return; rfkill->polling_paused = true; cancel_delayed_work_sync(&rfkill->poll_work); } EXPORT_SYMBOL(rfkill_pause_polling); void rfkill_resume_polling(struct rfkill *rfkill) { BUG_ON(!rfkill); if (!rfkill->ops->poll) return; rfkill->polling_paused = false; if (rfkill->suspended) return; queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); } EXPORT_SYMBOL(rfkill_resume_polling); #ifdef CONFIG_PM_SLEEP static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); rfkill->suspended = true; cancel_delayed_work_sync(&rfkill->poll_work); return 0; } static int rfkill_resume(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); bool cur; rfkill->suspended = false; if (!rfkill->registered) return 0; if (!rfkill->persistent) { cur = !!(rfkill->state & RFKILL_BLOCK_SW); rfkill_set_block(rfkill, cur); } if (rfkill->ops->poll && !rfkill->polling_paused) queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, 0); return 0; } static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); #define RFKILL_PM_OPS (&rfkill_pm_ops) #else #define RFKILL_PM_OPS NULL #endif static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, .pm = RFKILL_PM_OPS, }; bool rfkill_blocked(struct rfkill *rfkill) { unsigned long flags; u32 state; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; spin_unlock_irqrestore(&rfkill->lock, flags); return !!(state & RFKILL_BLOCK_ANY); } EXPORT_SYMBOL(rfkill_blocked); bool rfkill_soft_blocked(struct rfkill *rfkill) { unsigned long flags; u32 state; spin_lock_irqsave(&rfkill->lock, flags); state = rfkill->state; spin_unlock_irqrestore(&rfkill->lock, flags); return !!(state & RFKILL_BLOCK_SW); } EXPORT_SYMBOL(rfkill_soft_blocked); struct rfkill * __must_check rfkill_alloc(const char *name, struct device *parent, const enum rfkill_type type, const struct rfkill_ops *ops, void *ops_data) { struct rfkill *rfkill; struct device *dev; if (WARN_ON(!ops)) return NULL; if (WARN_ON(!ops->set_block)) return NULL; if (WARN_ON(!name)) return NULL; if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) return NULL; rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL); if (!rfkill) return NULL; spin_lock_init(&rfkill->lock); INIT_LIST_HEAD(&rfkill->node); rfkill->type = type; strcpy(rfkill->name, name); rfkill->ops = ops; rfkill->data = ops_data; dev = &rfkill->dev; dev->class = &rfkill_class; dev->parent = parent; device_initialize(dev); return rfkill; } EXPORT_SYMBOL(rfkill_alloc); static void rfkill_poll(struct work_struct *work) { struct rfkill *rfkill; rfkill = container_of(work, struct rfkill, poll_work.work); /* * Poll hardware state -- driver will use one of the * rfkill_set{,_hw,_sw}_state functions and use its * return value to update the current status. */ rfkill->ops->poll(rfkill, rfkill->data); queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); } static void rfkill_uevent_work(struct work_struct *work) { struct rfkill *rfkill; rfkill = container_of(work, struct rfkill, uevent_work); mutex_lock(&rfkill_global_mutex); rfkill_event(rfkill); mutex_unlock(&rfkill_global_mutex); } static void rfkill_sync_work(struct work_struct *work) { struct rfkill *rfkill = container_of(work, struct rfkill, sync_work); mutex_lock(&rfkill_global_mutex); rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); } int __must_check rfkill_register(struct rfkill *rfkill) { static unsigned long rfkill_no; struct device *dev; int error; if (!rfkill) return -EINVAL; dev = &rfkill->dev; mutex_lock(&rfkill_global_mutex); if (rfkill->registered) { error = -EALREADY; goto unlock; } rfkill->idx = rfkill_no; dev_set_name(dev, "rfkill%lu", rfkill_no); rfkill_no++; list_add_tail(&rfkill->node, &rfkill_list); error = device_add(dev); if (error) goto remove; error = rfkill_led_trigger_register(rfkill); if (error) goto devdel; rfkill->registered = true; INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll); INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work); INIT_WORK(&rfkill->sync_work, rfkill_sync_work); if (rfkill->ops->poll) queue_delayed_work(system_power_efficient_wq, &rfkill->poll_work, round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { rfkill->need_sync = true; schedule_work(&rfkill->sync_work); } else { #ifdef CONFIG_RFKILL_INPUT bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW); if (!atomic_read(&rfkill_input_disabled)) __rfkill_switch_all(rfkill->type, soft_blocked); #endif } rfkill_global_led_trigger_event(); rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); return 0; devdel: device_del(&rfkill->dev); remove: list_del_init(&rfkill->node); unlock: mutex_unlock(&rfkill_global_mutex); return error; } EXPORT_SYMBOL(rfkill_register); void rfkill_unregister(struct rfkill *rfkill) { BUG_ON(!rfkill); if (rfkill->ops->poll) cancel_delayed_work_sync(&rfkill->poll_work); cancel_work_sync(&rfkill->uevent_work); cancel_work_sync(&rfkill->sync_work); rfkill->registered = false; device_del(&rfkill->dev); mutex_lock(&rfkill_global_mutex); rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); rfkill_global_led_trigger_event(); mutex_unlock(&rfkill_global_mutex); rfkill_led_trigger_unregister(rfkill); } EXPORT_SYMBOL(rfkill_unregister); void rfkill_destroy(struct rfkill *rfkill) { if (rfkill) put_device(&rfkill->dev); } EXPORT_SYMBOL(rfkill_destroy); static int rfkill_fop_open(struct inode *inode, struct file *file) { struct rfkill_data *data; struct rfkill *rfkill; struct rfkill_int_event *ev, *tmp; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->max_size = RFKILL_EVENT_SIZE_V1; INIT_LIST_HEAD(&data->events); mutex_init(&data->mtx); init_waitqueue_head(&data->read_wait); mutex_lock(&rfkill_global_mutex); /* * start getting events from elsewhere but hold mtx to get * startup events added first */ list_for_each_entry(rfkill, &rfkill_list, node) { ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) goto free; rfkill_sync(rfkill); rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); mutex_lock(&data->mtx); list_add_tail(&ev->list, &data->events); mutex_unlock(&data->mtx); } list_add(&data->list, &rfkill_fds); mutex_unlock(&rfkill_global_mutex); file->private_data = data; return stream_open(inode, file); free: mutex_unlock(&rfkill_global_mutex); mutex_destroy(&data->mtx); list_for_each_entry_safe(ev, tmp, &data->events, list) kfree(ev); kfree(data); return -ENOMEM; } static __poll_t rfkill_fop_poll(struct file *file, poll_table *wait) { struct rfkill_data *data = file->private_data; __poll_t res = EPOLLOUT | EPOLLWRNORM; poll_wait(file, &data->read_wait, wait); mutex_lock(&data->mtx); if (!list_empty(&data->events)) res = EPOLLIN | EPOLLRDNORM; mutex_unlock(&data->mtx); return res; } static ssize_t rfkill_fop_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct rfkill_data *data = file->private_data; struct rfkill_int_event *ev; unsigned long sz; int ret; mutex_lock(&data->mtx); while (list_empty(&data->events)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } mutex_unlock(&data->mtx); /* since we re-check and it just compares pointers, * using !list_empty() without locking isn't a problem */ ret = wait_event_interruptible(data->read_wait, !list_empty(&data->events)); mutex_lock(&data->mtx); if (ret) goto out; } ev = list_first_entry(&data->events, struct rfkill_int_event, list); sz = min_t(unsigned long, sizeof(ev->ev), count); sz = min_t(unsigned long, sz, data->max_size); ret = sz; if (copy_to_user(buf, &ev->ev, sz)) ret = -EFAULT; list_del(&ev->list); kfree(ev); out: mutex_unlock(&data->mtx); return ret; } static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { struct rfkill_data *data = file->private_data; struct rfkill *rfkill; struct rfkill_event_ext ev; int ret; /* we don't need the 'hard' variable but accept it */ if (count < RFKILL_EVENT_SIZE_V1 - 1) return -EINVAL; /* * Copy as much data as we can accept into our 'ev' buffer, * but tell userspace how much we've copied so it can determine * our API version even in a write() call, if it cares. */ count = min(count, sizeof(ev)); count = min_t(size_t, count, data->max_size); if (copy_from_user(&ev, buf, count)) return -EFAULT; if (ev.type >= NUM_RFKILL_TYPES) return -EINVAL; mutex_lock(&rfkill_global_mutex); switch (ev.op) { case RFKILL_OP_CHANGE_ALL: rfkill_update_global_state(ev.type, ev.soft); list_for_each_entry(rfkill, &rfkill_list, node) if (rfkill->type == ev.type || ev.type == RFKILL_TYPE_ALL) rfkill_set_block(rfkill, ev.soft); ret = 0; break; case RFKILL_OP_CHANGE: list_for_each_entry(rfkill, &rfkill_list, node) if (rfkill->idx == ev.idx && (rfkill->type == ev.type || ev.type == RFKILL_TYPE_ALL)) rfkill_set_block(rfkill, ev.soft); ret = 0; break; default: ret = -EINVAL; break; } mutex_unlock(&rfkill_global_mutex); return ret ?: count; } static int rfkill_fop_release(struct inode *inode, struct file *file) { struct rfkill_data *data = file->private_data; struct rfkill_int_event *ev, *tmp; mutex_lock(&rfkill_global_mutex); list_del(&data->list); mutex_unlock(&rfkill_global_mutex); mutex_destroy(&data->mtx); list_for_each_entry_safe(ev, tmp, &data->events, list) kfree(ev); #ifdef CONFIG_RFKILL_INPUT if (data->input_handler) if (atomic_dec_return(&rfkill_input_disabled) == 0) printk(KERN_DEBUG "rfkill: input handler enabled\n"); #endif kfree(data); return 0; } static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct rfkill_data *data = file->private_data; int ret = -ENOTTY; u32 size; if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) return -ENOTTY; mutex_lock(&data->mtx); switch (_IOC_NR(cmd)) { #ifdef CONFIG_RFKILL_INPUT case RFKILL_IOC_NOINPUT: if (!data->input_handler) { if (atomic_inc_return(&rfkill_input_disabled) == 1) printk(KERN_DEBUG "rfkill: input handler disabled\n"); data->input_handler = true; } ret = 0; break; #endif case RFKILL_IOC_MAX_SIZE: if (get_user(size, (__u32 __user *)arg)) { ret = -EFAULT; break; } if (size < RFKILL_EVENT_SIZE_V1 || size > U8_MAX) { ret = -EINVAL; break; } data->max_size = size; ret = 0; break; default: break; } mutex_unlock(&data->mtx); return ret; } static const struct file_operations rfkill_fops = { .owner = THIS_MODULE, .open = rfkill_fop_open, .read = rfkill_fop_read, .write = rfkill_fop_write, .poll = rfkill_fop_poll, .release = rfkill_fop_release, .unlocked_ioctl = rfkill_fop_ioctl, .compat_ioctl = compat_ptr_ioctl, }; #define RFKILL_NAME "rfkill" static struct miscdevice rfkill_miscdev = { .fops = &rfkill_fops, .name = RFKILL_NAME, .minor = RFKILL_MINOR, }; static int __init rfkill_init(void) { int error; rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state); error = class_register(&rfkill_class); if (error) goto error_class; error = misc_register(&rfkill_miscdev); if (error) goto error_misc; error = rfkill_global_led_trigger_register(); if (error) goto error_led_trigger; #ifdef CONFIG_RFKILL_INPUT error = rfkill_handler_init(); if (error) goto error_input; #endif return 0; #ifdef CONFIG_RFKILL_INPUT error_input: rfkill_global_led_trigger_unregister(); #endif error_led_trigger: misc_deregister(&rfkill_miscdev); error_misc: class_unregister(&rfkill_class); error_class: return error; } subsys_initcall(rfkill_init); static void __exit rfkill_exit(void) { #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif rfkill_global_led_trigger_unregister(); misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } module_exit(rfkill_exit); MODULE_ALIAS_MISCDEV(RFKILL_MINOR); MODULE_ALIAS("devname:" RFKILL_NAME); |
| 22 22 117 117 117 116 130 131 30 131 131 131 131 131 22 22 118 116 3 117 118 118 116 117 92 118 98 98 98 94 38 19 19 5 31 30 30 22 30 30 21 30 30 30 2 2 2 2 2 31 31 31 31 31 31 11 31 30 31 22 31 9 9 9 9 9 9 119 119 119 9 9 118 22 22 118 84 84 56 66 119 130 121 121 118 100 130 129 131 131 30 30 131 130 130 9 9 121 119 119 129 130 129 130 5 13 13 13 13 13 13 13 6 13 13 13 13 13 12 5 13 13 13 13 13 13 13 13 13 13 12 13 13 13 13 12 13 13 13 13 13 3 17 3 3 17 20 20 20 20 20 20 17 20 20 20 17 20 20 30 13 13 29 13 13 20 20 20 72 98 98 98 38 38 38 94 94 98 98 98 72 98 98 98 38 38 38 94 94 94 88 87 8 8 8 8 98 98 98 8 94 94 72 94 94 94 87 94 88 18 18 5 5 1 5 116 116 116 115 115 58 13 58 58 13 13 13 98 98 94 94 98 98 30 102 102 102 102 102 30 18 5 1 1 1 1 30 30 30 30 102 102 23 102 98 38 102 101 102 102 102 102 3 3 3 3 1 3 3 3 3 1 1 1 126 126 126 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 24 24 24 23 24 24 24 22 22 22 12 12 22 21 22 22 22 22 22 22 22 22 22 22 22 22 23 20 13 23 3 23 23 25 25 24 25 25 6 6 24 24 24 5 5 5 24 24 6 23 24 24 23 23 23 23 23 23 23 23 23 23 2 2 24 25 25 5 5 5 5 5 5 5 5 5 5 5 1 9 107 107 107 106 19 107 107 107 107 9 9 9 9 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 97 97 97 97 97 97 37 107 107 106 107 27 27 27 26 27 27 27 27 27 25 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> * * Architecture independence: * Copyright (c) 2005, Bull S.A. * Written by Pierre Peiffer <pierre.peiffer@bull.net> */ /* * Extents support for EXT4 * * TODO: * - ext4*_error() should be used in some situations * - analyze all BUG()/BUG_ON(), use -EIO where appropriate * - smart tree reduction */ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" #include <trace/events/ext4.h> /* * used by extent splitting. */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } static int ext4_extent_block_csum_verify(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); if (et->et_checksum != ext4_extent_block_csum(inode, eh)) return 0; return 1; } static void ext4_extent_block_csum_set(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); et->et_checksum = ext4_extent_block_csum(inode, eh); } static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t split, int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } static inline void ext4_ext_path_brelse(struct ext4_ext_path *path) { brelse(path->p_bh); path->p_bh = NULL; } static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; if (IS_ERR_OR_NULL(path)) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) ext4_ext_path_brelse(path); } void ext4_free_ext_path(struct ext4_ext_path *path) { if (IS_ERR_OR_NULL(path)) return; ext4_ext_drop_refs(path); kfree(path); } /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. The function drops i_data_sem * when restarting transaction and gets it after transaction is restarted. * * The function returns 0 on success, 1 if transaction had to be restarted, * and < 0 in case of fatal error. */ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); return ret; } /* * could return: * - EROFS * - ENOMEM */ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err = 0; if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, path->p_bh, EXT4_JTR_NONE); /* * The extent buffer's verified bit will be set again in * __ext4_ext_dirty(). We could leave an inconsistent * buffer if the extents updating procudure break off du * to some error happens, force to check it again. */ if (!err) clear_buffer_verified(path->p_bh); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ return err; } /* * could return: * - EROFS * - ENOMEM * - EIO */ static int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); /* Extents updating done, re-set verified flag */ if (!err) set_buffer_verified(path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); } return err; } #define ext4_ext_dirty(handle, inode, path) \ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { if (path) { int depth = path->p_depth; struct ext4_extent *ex; /* * Try to predict block placement assuming that we are * filling in a file which will eventually be * non-sparse --- i.e., in the case of libbfd writing * an ELF object sections out-of-order but in a way * the eventually results in a contiguous object or * executable file, or some database extending a table * space file. However, this is actually somewhat * non-ideal if we are writing a sparse file such as * qemu or KVM writing a raw image file that is going * to stay fairly sparse, since it will end up * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, * especially if the latter case turns out to be * common. */ ex = path[depth].p_ext; if (ex) { ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); if (block > ext_block) return ext_pblk + (block - ext_block); else return ext_pblk - (ext_block - block); } /* it looks like index is empty; * try to find starting block from index itself */ if (path[depth].p_bh) return path[depth].p_bh->b_blocknr; } /* OK. use inode's group */ return ext4_inode_to_goal_block(inode); } /* * Allocation for a meta data block */ static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err); return newblock; } static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 6) size = 6; #endif return size; } static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 5) size = 5; #endif return size; } static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 3) size = 3; #endif return size; } static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 4) size = 4; #endif return size; } static inline struct ext4_ext_path * ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; ext4_lblk_t lblock = 0; ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); /* * The logical block in the first entry should equal to * the number in the index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext->ee_block)) return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); /* * The logical block in the first entry should equal to * the number in the parent index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext_idx->ei_block)) return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; cur = lblock + 1; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } max = ext4_ext_max_entries(inode, depth); if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } if (unlikely((eh->eh_entries == 0) && (depth > 0))) { error_msg = "eh_entries is 0 but eh_depth is > 0"; goto corrupted; } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode_err(inode, function, line, 0, -err, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static void ext4_cache_extents(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); int len = ext4_ext_get_actual_len(ex); if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_idx *idx, int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); ext4_cache_extents(inode, eh); } return bh; errout: put_bh(bh); return ERR_PTR(err); } #define read_extent_tree_block(inode, idx, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ ext4_check_map_extents_env(inode); down_read(&ei->i_data_sem); depth = ext_depth(inode); /* Don't cache anything if there are no external extent blocks */ if (!depth) { up_read(&ei->i_data_sem); return ret; } path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { ext4_ext_path_brelse(path + i); i--; continue; } bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); ext4_free_ext_path(path); return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(inode, " []"); } ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (IS_ERR_OR_NULL(path)) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug(inode, "%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; } path->p_idx = l - 1; ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); if (block < le32_to_cpu(m->ee_block)) r = m - 1; else l = m + 1; } path->p_ext = l - 1; ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); } #endif } void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } if (path) { ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); path = NULL; } } if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; if (!(flags & EXT4_EX_NOCACHE) && depth == 0) ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } eh = ext_block_hdr(bh); ppos++; path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_free_ext_path(path); return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug(inode, "insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug(inode, "insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * - allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. */ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); if (goal > le32_to_cpu(es->s_first_data_block)) { flags |= EXT4_MB_HINT_TRY_GOAL; goal--; } else goal = ext4_inode_to_goal_block(inode); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, &err); if (newblock == 0) return err; bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; } ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); /* zero out unused area in the extent block */ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); neh->eh_entries = cpu_to_le16(1); ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); if (neh->eh_depth == 0) { /* Root extent block becomes index block */ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); return err; } /* * ext4_ext_create_new_leaf: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ static struct ext4_ext_path * ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); repeat: i = depth = ext_depth(inode); /* walk up to the tree and look for free index entry */ curp = path + depth; while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { i--; curp--; } /* we use already allocated block for index block, * so subsequent data blocks should be contiguous */ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto errout; /* refill path */ path = ext4_find_extent(inode, ee_block, path, gb_flags); return path; } /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto errout; /* refill path */ path = ext4_find_extent(inode, ee_block, path, gb_flags); if (IS_ERR(path)) return path; /* * only first (depth 0 -> 1) produces free space; * in all other cases we have to split the grown tree */ depth = ext_depth(inode); if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { /* now we need to split */ goto repeat; } return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * search the closest allocated block to the left for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the smallest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } } return 0; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * Search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys. * If not exists, return 0 and @phys is set to 0. We will return * 1 which means we found an allocated block and ret_ex is valid. * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); return -EFSCORRUPTED; } } goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) goto got_index; } /* we've gone up to the root and found no index to the right */ return 0; got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); put_bh(bh); } bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); if (ret_ex) *ret_ex = *ex; if (bh) put_bh(bh); return 1; } /* * ext4_ext_next_allocated_block: * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. */ ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; while (depth >= 0) { struct ext4_ext_path *p = &path[depth]; if (depth == path->p_depth) { /* leaf */ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); struct ext4_extent *ex; __le32 border; int k, err = 0; eh = path[depth].p_hdr; ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } if (depth == 0) { /* there is no tree at all */ return 0; } if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) goto clean; } return 0; clean: /* * The path[k].p_bh is either unmodified or with no verified bit * set (see ext4_ext_get_access()). So just clear the verified bit * of the successfully modified extents buffers, which will force * these extents to be checked to avoid using inconsistent data. */ while (++k < depth) clear_buffer_verified(path[k].p_bh); return err; } static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. */ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2, ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* * Copy the extent data up to the inode */ blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); ext4_ext_path_brelse(path + 1); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. */ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: return ret; } /* * ext4_ext_insert_extent: * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ struct ext4_ext_path * ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ int depth, len, err = 0; ext4_lblk_t next; int mb_flags = 0, unwritten; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); err = -EFSCORRUPTED; goto errout; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; goto errout; } /* try to insert block into found extent and return */ if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { struct ext4_ext_path *npath; ext_debug(inode, "next leaf block - %u\n", next); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) { err = PTR_ERR(npath); goto errout; } BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); ext4_free_ext_path(path); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_free_ext_path(npath); } /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, path, newext); if (IS_ERR(path)) return path; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: nearex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, len * sizeof(struct ext4_extent)); } } le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto errout; err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) goto errout; return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) { ext4_lblk_t next, end = block + num - 1; struct extent_status es; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; unsigned int flags; int err; while (block <= end) { next = 0; flags = 0; if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (ext4_es_is_delayed(&es)) flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ext4_es_is_hole(&es)) flags |= EXT4_FIEMAP_EXTENT_HOLE; if (next == 0) flags |= FIEMAP_EXTENT_LAST; if (flags & (FIEMAP_EXTENT_DELALLOC| EXT4_FIEMAP_EXTENT_HOLE)) es.es_pblk = 0; else es.es_pblk = ext4_es_pblock(&es); err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; block = next; if (err < 0) return err; if (err == 1) return 0; } return 0; } /* * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * * Determine hole length (and start if easily possible) around given logical * block. We don't try too hard to find the beginning of the hole but @path * actually points to extent before @lblk, we provide it. * * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ *lblk = 0; len = EXT_MAX_BLOCKS; } else if (*lblk < le32_to_cpu(ex->ee_block)) { len = le32_to_cpu(ex->ee_block) - *lblk; } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); BUG_ON(next == *lblk); len = next - *lblk; } else { BUG(); } return len; } /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; int k = depth - 1; /* free index block */ leaf = ext4_idx_pblock(path[k].p_idx); if (unlikely(path[k].p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; len *= sizeof(struct ext4_extent_idx); memmove(path[k].p_idx, path[k].p_idx + 1, len); } le16_add_cpu(&path[k].p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--k >= 0) { if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path + k); if (err) goto clean; } return 0; clean: /* * The path[k].p_bh is either unmodified or with no verified bit * set (see ext4_ext_get_access()). So just clear the verified bit * of the successfully modified extents buffers, which will force * these extents to be checked to avoid using inconsistent data. */ while (++k < depth) clear_buffer_verified(path[k].p_bh); return err; } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. * When pass the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; /* probably there is space in leaf? */ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* * There are some space in the leaf tree, no * need to account for leaf block credit * * bitmaps and block group descriptor blocks * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? * * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; /* * Extent tree can change between the time we estimate credits and * the time we actually modify the tree. Assume the worst case. */ if (extents <= 1) index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else index = (EXT4_MAX_EXTENT_DEPTH * 3) + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } static inline int get_default_free_blocks_flags(struct inode *inode) { if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } /* * ext4_rereserve_cluster - increment the reserved cluster count when * freeing a cluster with a pending reservation * * @inode - file containing the cluster * @lblk - logical block in cluster to be reserved * * Increments the reserved cluster count and adjusts quota in a bigalloc * file system when freeing a partial cluster containing at least one * delayed and unwritten block. A partial cluster meeting that * requirement will have a pending reservation. If so, the * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to * defer reserved and allocated space accounting to a subsequent call * to this function. */ static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); spin_unlock(&ei->i_block_reservation_lock); percpu_counter_add(&sbi->s_freeclusters_counter, 1); ext4_remove_pending(inode, lblk); } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t last_pblk, pblk; ext4_lblk_t num; int flags; /* only extent tail removal is allowed */ if (from < le32_to_cpu(ex->ee_block) || to != le32_to_cpu(ex->ee_block) + ee_len - 1) { ext4_error(sbi->s_sb, "strange request: removal(2) %u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } #ifdef EXTENTS_STATS spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); #endif trace_ext4_remove_blocks(inode, ex, from, to, partial); /* * if we have a partial cluster, and it's different from the * cluster of the last block in the extent, we free it */ last_pblk = ext4_ext_pblock(ex) + ee_len - 1; if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, last_pblk)) { if (partial->state == tofree) { flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * We free the partial cluster at the end of the extent (if any), * unless the cluster is used by another extent (partial_cluster * state is nofree). If a partial cluster exists here, it must be * shared with the last block in the extent. */ flags = get_default_free_blocks_flags(inode); /* partial, left end cluster aligned, right end unaligned */ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && (EXT4_LBLK_CMASK(sbi, to) >= from) && (partial->state != nofree)) { if (ext4_is_pending(inode, to)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, to); partial->state = initial; flags = get_default_free_blocks_flags(inode); } flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* reset the partial cluster if we've freed past it */ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) partial->state = initial; /* * If we've freed the entire extent but the beginning is not left * cluster aligned and is not marked as ineligible for freeing we * record the partial cluster at the beginning of the extent. It * wasn't freed by the preceding ext4_free_blocks() call, and we * need to look farther to the left to determine if it's to be freed * (not shared with another extent). Else, reset the partial * cluster - we're either done freeing or the beginning of the * extent is left cluster aligned. */ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { if (partial->state == initial) { partial->pclu = EXT4_B2C(sbi, pblk); partial->lblk = from; partial->state = tofree; } } else { partial->state = initial; } return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents * has been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = max(ex_ee_block, start); b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); partial->pclu = EXT4_B2C(sbi, pblk); partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); /* * We may end up freeing some index blocks and data from the * punched range. Note that partial clusters are accounted for * by ext4_free_data_revoke_credits(). */ revoke_credits = ext4_free_metadata_revoke_credits(inode->i_sb, ext_depth(inode)) + ext4_free_data_revoke_credits(inode, b - a + 1); err = ext4_datasem_ensure_credits(handle, inode, credits, credits, revoke_credits); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. */ if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (partial->pclu != EXT4_B2C(sbi, pblk)) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; partial.state = initial; ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, depth + 1, ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it * in ext4_ext_rm_leaf(). */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); } depth = ext_depth(inode); /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ path = ext4_force_split_extent_at(handle, inode, path, end + 1, 1); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* * If we're punching, there's an extent to the right. * If the partial cluster hasn't been set, set it to * that extent's first cluster and its state to nofree * so it won't be freed should it contain blocks to be * removed. If it's already set (tofree/nofree), we're * retrying and keep the original partial cluster info * so a cluster marked tofree as a result of earlier * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, NULL, flags); if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EFSCORRUPTED; goto out; } } err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ ext4_ext_path_brelse(path + i); i--; continue; } /* this is index block */ if (!path[i].p_hdr) { ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. */ cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ ext4_ext_path_brelse(path + i); i--; ext_debug(inode, "return to level %d\n", i); } } trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, path->p_hdr->eh_entries); /* * if there's a partial cluster and we have removed the first extent * in the file, then we also free the partial cluster, if any */ if (partial.state == tofree && err == 0) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial.lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial.pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial.lblk); partial.state = initial; } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, sbi->s_ext_blocks / sbi->s_ext_extents); printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } #endif } static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN, false); } /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, ee_len); } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states * of which are determined by split_flag. * * There are two cases: * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * * Return an extent path pointer on success, or an error pointer on failure. */ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t split, int split_flag, int flags) { ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (split == ee_block) { /* * case b: block @split is the block that the extent begins with * then we just change the state of the extent, and splitting * is not needed. */ if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (split_flag & EXT4_EXT_MARK_UNWRIT1) ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (!IS_ERR(path)) goto out; err = PTR_ERR(path); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) return path; /* * Get a new path to try to zeroout or fix the extent length. * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() * will not return -ENOMEM, otherwise -ENOMEM will cause a * retry in do_writepages(), and a WARN_ON may be triggered * in ext4_da_update_reserve_space() due to an incorrect * ee_len causing the i_reserved_data_blocks exception. */ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); return path; } depth = ext_depth(inode); ex = path[depth].p_ext; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } if (!err) { /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more * damage. */ goto out; } } fix_extent_len: ex->ee_len = orig_ex.ee_len; /* * Ignore ext4_ext_dirty return value since we are already in error path * and err is a non-zero error code. */ ext4_ext_dirty(handle, inode, path + path->p_depth); out: if (err) { ext4_free_ext_path(path); path = ERR_PTR(err); } ext4_ext_show_leaf(inode, path); return path; } /* * ext4_split_extent() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Somone is splitting in middle of the extent * */ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_map_blocks *map, int split_flag, int flags, unsigned int *allocated) { ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; int unwritten; int split_flag1, flags1; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); if (IS_ERR(path)) return path; /* * Update path is required because previous ext4_split_extent_at * may result in split of original leaf or extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); ext4_free_ext_path(path); return ERR_PTR(-EFSCORRUPTED); } unwritten = ext4_ext_is_unwritten(ex); } if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } path = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (IS_ERR(path)) return path; } if (allocated) { if (map->m_lblk + map->m_len > ee_block + ee_len) *allocated = ee_len - (map->m_lblk - ee_block); else *allocated = map->m_len; } ext4_ext_show_leaf(inode, path); return path; } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->l_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ static struct ext4_ext_path * ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map_len) eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex1.ee_len = 0; zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. * * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ *allocated = 0; if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of ex by 'map_len' blocks */ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ *allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ *allocated = map_len; } } if (*allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; if (err) goto errout; goto out; } else *allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); /* * five cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the head of the first * extent. * 3. split the extent into two extents, zeroout the tail of the second * extent. * 4. split the extent into two extents with out zeroout. * 5. no splitting needed, just possibly zeroout the head and / or the * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; if (max_zeroout && (*allocated > split_map.m_len)) { if (*allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = cpu_to_le16(*allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; split_map.m_len = *allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { /* case 2 or 5 */ if (split_map.m_lblk != ee_block) { zero_ex2.ee_block = ex->ee_block; zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; *allocated = map->m_len; } } fallback: path = ext4_split_extent(handle, inode, path, &split_map, split_flag, flags, NULL); if (IS_ERR(path)) return path; out: /* If we have gotten a failure, don't zero out status tree */ ext4_zeroout_es(inode, &zero_ex1); ext4_zeroout_es(inode, &zero_ex2); return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an unwritten extent. * * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after * the unwritten extent split. To prevent ENOSPC occur at the IO * complete, we need to split the unwritten extent before DIO submit * the IO. The unwritten extent called at this time will be split * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * The size of unwritten extent to be written is passed to the caller via the * allocated pointer. Return an extent path pointer on success, or an error * pointer on failure. */ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; return ext4_split_extent(handle, inode, path, map, split_flag, flags, allocated); } static struct ext4_ext_path * ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still * have some extent state machine issues left. So extent_split is still * required. * TODO: Once all related issues will be fixed this situation should be * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif path = ext4_split_convert_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) return path; path = ext4_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return path; depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) goto errout; ext4_ext_show_leaf(inode, path); return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, unsigned int *allocated) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { path = ext4_split_convert_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); if (IS_ERR(path)) return path; path = ext4_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); err = -EFSCORRUPTED; goto errout; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) goto errout; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); map->m_flags |= EXT4_MAP_UNWRITTEN; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } static struct ext4_ext_path * ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated, ext4_fsblk_t newblock) { int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, *allocated); ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, *allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { path = ext4_split_convert_extents(handle, inode, map, path, flags | EXT4_GET_BLOCKS_CONVERT, allocated); if (IS_ERR(path)) return path; /* * shouldn't get a 0 allocated when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto errout; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { path = ext4_convert_unwritten_extents_endio(handle, inode, map, path); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * We have blocks reserved already. We * return allocated blocks so that delalloc * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. */ map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ path = ext4_ext_convert_to_initialized(handle, inode, map, path, flags, allocated); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto errout; } out: map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; ext4_ext_show_leaf(inode, path); return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation * * This function is called by ext4_ext_map_blocks() after we failed to * find blocks that were already in the inode's extent tree. Hence, * we know that the beginning of the requested region cannot overlap * the extent from the inode's extent tree. There are three cases we * want to catch. The first is this case: * * |--- cluster # N--| * |--- extent ---| |---- requested region ---| * |==========| * * The second case that we need to test for is this one: * * |--------- cluster # N ----------------| * |--- requested region --| |------- extent ----| * |=======================| * * The third case is when the requested region lies between two extents * within the same cluster: * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| * * In each of the above cases, we need to set the map->m_pblk and * map->m_len so it corresponds to the return the extent labelled as * "|====|" from cluster #N, since it is already in use for data in * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to * signal to ext4_ext_map_blocks() that map->m_pblk should be treated * as a new "allocated" block region. Otherwise, we will return 0 and * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Determine hole length around the given logical block, first try to * locate and expand the hole from the given @path, and then adjust it * if it's partially or completely converted to delayed extents, insert * it into the extent cache tree if it's indeed a hole, finally return * the length of the determined extent. */ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk) { ext4_lblk_t hole_start, len; struct extent_status es; hole_start = lblk; len = ext4_ext_find_hole(inode, path, &hole_start); again: ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, hole_start + len - 1, &es); if (!es.es_len) goto insert_hole; /* * There's a delalloc extent in the hole, handle it if the delalloc * extent is in front of, behind and straddle the queried range. */ if (lblk >= es.es_lblk + es.es_len) { /* * The delalloc extent is in front of the queried range, * find again from the queried start block. */ len -= lblk - hole_start; hole_start = lblk; goto again; } else if (in_range(lblk, es.es_lblk, es.es_len)) { /* * The delalloc extent containing lblk, it must have been * added after ext4_map_blocks() checked the extent status * tree so we are not holding i_rwsem and delalloc info is * only stabilized by i_data_sem we are going to release * soon. Don't modify the extent status tree and report * extent as a hole, just adjust the length to the delalloc * extent's after lblk. */ len = es.es_lblk + es.es_len - lblk; return len; } else { /* * The delalloc extent is partially or completely behind * the queried range, update hole length until the * beginning of the delalloc extent. */ len = min(es.es_lblk - hole_start, len); } insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE, false); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) len -= lblk - hole_start; return len; } /* * Block allocation/map/preallocation routine for extents based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * * return = 0, if plain look up failed (blocks have not been allocated) * buffer head is unmapped * * return < 0, error case. */ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } depth = ext_depth(inode); /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug(inode, "%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { path = convert_initialized_extent(handle, inode, map, path, &allocated); if (IS_ERR(path)) err = PTR_ERR(path); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); goto out; } path = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, &allocated, newblock); if (IS_ERR(path)) err = PTR_ERR(path); goto out; } } /* * requested block isn't allocated yet; * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, len); goto out; } /* * Okay, we need to do block allocation. */ newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2, flags); if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && err && get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; err = 0; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is * EXT_INIT_MAX_LEN and for an unwritten extent this limit is * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else allocated = map->m_len; /* allocate new block */ ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; /* * We calculate the offset from the beginning of the cluster * for the logical block number, since when we allocate a * physical cluster, the physical block should start at the * same offset from the beginning of the cluster. This is * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ pblk = newblock + offset; ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (IS_ERR(path)) { err = PTR_ERR(path); if (allocated_clusters) { int fb_flags = 0; /* * free data blocks we just allocated. * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, EXT4_C2B(sbi, allocated_clusters), fb_flags); } goto out; } /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); map->m_pblk = pblk; map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: /* * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. * So we know that the depth used here is correct, since there was no * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. * If tomorrow we start using this QUERY flag with CREATE, then we will * need to re-calculate the depth as it might have changed due to block * allocation. */ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; } ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; int err = 0; /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. */ /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos, old_size = i_size_read(inode); unsigned int blkbits = inode->i_blkbits; bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* * Do the actual write zero during a running journal transaction * costs a lot. First allocate an unwritten extent and then * convert it to written after zeroing it out. */ if (flags & EXT4_GET_BLOCKS_ZERO) { flags &= ~EXT4_GET_BLOCKS_ZERO; flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; alloc_zero = true; } /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); retry: while (len) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); break; } /* * allow a full retry cycle for any remaining allocations */ retries = 0; epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); if (epos > old_size) { pagecache_isize_extended(inode, old_size, epos); ext4_zero_partial_blocks(handle, inode, old_size, epos - old_size); } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret3 = ext4_journal_stop(handle); ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; if (alloc_zero && (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, map.m_len); if (likely(!ret2)) ret2 = ext4_convert_unwritten_extents(NULL, inode, (loff_t)map.m_lblk << blkbits, (loff_t)map.m_len << blkbits); if (ret2) break; } map.m_lblk += ret; map.m_len = len = len - ret; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret > 0 ? ret2 : ret; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); handle_t *handle = NULL; loff_t new_size = 0; loff_t end = offset + len; ext4_lblk_t start_lblk, end_lblk; unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); /* Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Preallocate the range including the unaligned edges */ if (!IS_ALIGNED(offset | end, blocksize)) { ext4_lblk_t alloc_lblk = offset >> blkbits; ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, new_size, flags); if (ret) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) return ret; /* Now release the pages and zero block aligned part of pages */ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; /* Zero range excluding the unaligned edges */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> blkbits; if (end_lblk > start_lblk) { ext4_lblk_t zero_blks = end_lblk - start_lblk; if (mode & FALLOC_FL_WRITE_ZEROES) flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; else flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, new_size, flags); if (ret) return ret; } /* Finish zeroing out if it doesn't contain partial block */ if (IS_ALIGNED(offset | end, blocksize)) return ret; /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode */ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; if (ext4_should_journal_data(inode)) credits += 2; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); return ret; } /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret) goto out_handle; if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } static long ext4_do_fallocate(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); loff_t end = offset + len; loff_t new_size = 0; ext4_lblk_t start_lblk, len_lblk; int ret; trace_ext4_fallocate_enter(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); start_lblk = offset >> inode->i_blkbits; len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); /* We only support preallocation for extent-based files only. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) goto out; } ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); if (ret) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; int ret; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). */ if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* * Don't allow writing zeroes if the underlying device does not * enable the unmap write zeroes operation. */ if ((mode & FALLOC_FL_WRITE_ZEROES) && !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); if (ret) goto out_inode_lock; /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_inode_lock; if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { ret = ext4_do_fallocate(file, offset, len, mode); goto out_inode_lock; } /* * Follow-up operations will drop page cache, hold invalidate lock * to prevent page faults from reinstantiating pages we have * released from page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_invalidate_lock; switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: ret = ext4_punch_hole(file, offset, len); break; case FALLOC_FL_COLLAPSE_RANGE: ret = ext4_collapse_range(file, offset, len); break; case FALLOC_FL_INSERT_RANGE: ret = ext4_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_WRITE_ZEROES: ret = ext4_zero_range(file, offset, len, mode); break; default: ret = -EOPNOTSUPP; } out_invalidate_lock: filemap_invalidate_unlock(mapping); out_inode_lock: inode_unlock(inode); return ret; } /* * This function converts a range of blocks to written extents. The caller of * this function will pass the start offset and the size. all unwritten extents * within this range will be converted to written extents. * * This function is called from the direct IO end io call back function for * atomic writes, to convert the unwritten extents after IO is completed. * * Note that the requirement for atomic writes is that all conversion should * happen atomically in a single fs journal transaction. We mainly only allocate * unwritten extents either on a hole on a pre-exiting unwritten extent range in * ext4_map_blocks_atomic_write(). The only case where we can have multiple * unwritten extents in a range [offset, offset+len) is when there is a split * unwritten extent between two leaf nodes which was cached in extent status * cache during ext4_iomap_alloc() time. That will allow * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going * into the slow path. That means we might need a loop for conversion of this * unwritten extent split across leaf block within a single journal transaction. * Split extents across leaf nodes is a rare case, but let's still handle that * to meet the requirements of multi-fsblock atomic writes. * * Returns 0 on success. */ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * TODO: An optimization can be added later by having an extent * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that * it can tell if the extent in the cache is a split extent. * But for now let's assume pextents as 2 always. */ credits = ext4_meta_trans_blocks(inode, max_blocks, 2); } if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); return ret; } } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); ret = ext4_map_blocks(handle, inode, &map, flags); if (ret != max_blocks) ext4_msg(inode->i_sb, KERN_INFO, "inode #%lu: block %u: len %u: " "split block mapping found for atomic write, " "ret = %d", inode->i_ino, map.m_lblk, map.m_len, ret); if (ret <= 0) break; } ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "returned %d or %d", inode->i_ino, map.m_lblk, map.m_len, ret, ret2); return ret > 0 ? ret2 : ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } } /* * Do not cache any unrelated extents, as it does not hold the * i_rwsem or invalidate_lock, which could corrupt the extent * status tree. */ ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) break; } return ret > 0 ? ret2 : ret; } int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. */ if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); if (IS_ERR(handle)) return PTR_ERR(handle); } list_for_each_entry(io_end_vec, &io_end->list_vec, list) { ret = ext4_convert_unwritten_extents(handle, io_end->inode, io_end_vec->offset, io_end_vec->size); if (ret) break; } if (handle) err = ext4_journal_stop(handle); return ret < 0 ? ret : err; } static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; u16 iomap_type; /* in-inode? */ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) return error; physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; brelse(iloc.bh); iomap_type = IOMAP_INLINE; } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; iomap_type = IOMAP_MAPPED; } else { /* no in-inode or external block for xattr, so return -ENOENT */ error = -ENOENT; goto out; } iomap->addr = physical; iomap->offset = 0; iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; out: return error; } static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int error; error = ext4_iomap_xattr_fiemap(inode, iomap); if (error == 0 && (offset >= iomap->length)) error = -ENOENT; return error; } static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; return 0; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int error = 0; inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the * generic check. */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_xattr_ops); } else { error = iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } unlock: inode_unlock_shared(inode); return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk, len_blks; __u64 last_blk; int error = 0; if (ext4_has_inline_data(inode)) { int has_inline; down_read(&EXT4_I(inode)->xattr_sem); has_inline = ext4_has_inline_data(inode); up_read(&EXT4_I(inode)->xattr_sem); if (has_inline) return 0; } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { inode_lock_shared(inode); error = ext4_ext_precache(inode); inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. */ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); /* leaf + sb + inode */ credits = 3; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { update = true; /* extent tree + sb + inode */ credits = depth + 2; } restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. */ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ again: ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; if (tmp != EXT_MAX_BLOCKS) *iterator = tmp; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; else { extent = EXT_LAST_EXTENT(path[depth].p_hdr); while (le32_to_cpu(extent->ee_block) >= start) extent--; if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) break; extent++; iterator = NULL; } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); /* iterator can be NULL which means we should break */ if (ret == -EAGAIN) goto again; if (ret) break; } out: ext4_free_ext_path(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; loff_t end = offset + len; ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; loff_t start, new_size; int ret; trace_ext4_collapse_range(inode, offset, len); WARN_ON_ONCE(!inode_is_locked(inode)); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (end >= inode->i_size) return -EINVAL; /* * Write tail of the last page before removed range and data that * will be shifted since they will get removed from the page cache * below. We are also protected from pages becoming dirty by * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ start = round_down(offset, PAGE_SIZE); ret = filemap_write_and_wait_range(mapping, start, offset); if (!ret) ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) return ret; truncate_pagecache(inode, start); credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); start_lblk = offset >> inode->i_blkbits; end_lblk = (offset + len) >> inode->i_blkbits; ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, end_lblk, end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. */ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret, depth, split_flag = 0; loff_t start; trace_ext4_insert_range(inode, offset, len); WARN_ON_ONCE(!inode_is_locked(inode)); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; /* Offset must be less than i_size */ if (offset >= inode->i_size) return -EINVAL; /* Check whether the maximum file size would be exceeded */ if (len > inode->i_sb->s_maxbytes - inode->i_size) return -EFBIG; /* * Write out all dirty pages. Need to round down to align start offset * to page size boundary for page size > block size. */ start = round_down(offset, PAGE_SIZE); ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) return ret; truncate_pagecache(inode, start); credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_handle; start_lblk = offset >> inode->i_blkbits; len_lblk = len >> inode->i_blkbits; ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); goto out_handle; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If start_lblk is not the starting block of extent, split * the extent @start_lblk */ if ((start_lblk > ee_start_lblk) && (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); goto out_handle; } } ext4_free_ext_path(path); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } /** * ext4_swap_extents() - Swap extents between two inodes * @handle: handle for this transaction * @inode1: First inode * @inode2: Second inode * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; struct ext4_ext_path *path2 = NULL; int replaced_count = 0; BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); ext4_es_remove_extent(inode1, lblk1, count); ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; ext4_lblk_t e1_blk, e2_blk; int e1_len, e2_len, len; int split = 0; path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) goto errout; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); e1_len = ext4_ext_get_actual_len(ex1); e2_len = ext4_ext_get_actual_len(ex2); /* Hole handling */ if (!in_range(lblk1, e1_blk, e1_len) || !in_range(lblk2, e2_blk, e2_len)) { ext4_lblk_t next1, next2; /* if hole after extent, then go to next extent */ next1 = ext4_ext_next_allocated_block(path1); next2 = ext4_ext_next_allocated_block(path2); /* If hole before extent, then shift to that extent */ if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto errout; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) len = next2 - lblk2; if (len > count) len = count; lblk1 += len; lblk2 += len; count -= len; continue; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; path1 = ext4_force_split_extent_at(handle, inode1, path1, lblk1, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } } if (e2_blk < lblk2) { split = 1; path2 = ext4_force_split_extent_at(handle, inode2, path2, lblk2, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) continue; /* Prepare right boundary */ len = count; if (len > e1_blk + e1_len - lblk1) len = e1_blk + e1_len - lblk1; if (len > e2_blk + e2_len - lblk2) len = e2_blk + e2_len - lblk2; if (len != e1_len) { split = 1; path1 = ext4_force_split_extent_at(handle, inode1, path1, lblk1 + len, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } } if (len != e2_len) { split = 1; path2 = ext4_force_split_extent_at(handle, inode2, path2, lblk2 + len, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) continue; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) goto errout; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto errout; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); ex1->ee_len = cpu_to_le16(e2_len); ex2->ee_len = cpu_to_le16(e1_len); if (unwritten) ext4_ext_mark_unwritten(ex2); if (ext4_ext_is_unwritten(&tmp_ex)) ext4_ext_mark_unwritten(ex1); ext4_ext_try_to_merge(handle, inode2, path2, ex2); ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto errout; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* * Looks scarry ah..? second inode already points to new blocks, * and it was successfully dirtied. But luckily error may happen * only due to journal error, so full transaction will be * aborted anyway. */ if (unlikely(*erp)) goto errout; lblk1 += len; lblk2 += len; replaced_count += len; count -= len; } errout: ext4_free_ext_path(path1); ext4_free_ext_path(path2); return replaced_count; } /* * ext4_clu_mapped - determine whether any block in a logical cluster has * been mapped to a physical cluster * * @inode - file containing the logical cluster * @lclu - logical cluster of interest * * Returns 1 if any block in the logical cluster is mapped, signifying * that a physical cluster has been allocated for it. Otherwise, * returns 0. Can also return negative error codes. Derived from * ext4_ext_map_blocks(). */ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_ext_path *path; int depth, mapped = 0, err = 0; struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; /* * if data can be stored inline, the logical cluster isn't * mapped - no physical clusters have been allocated, and the * file has no extents */ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); /* * A consistent leaf must not be empty. This situation is possible, * though, _during_ tree modification, and it's why an assert can't * be put in ext4_find_extent(). */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address - lblock: %lu, depth: %d, pblock: %lld", (unsigned long) EXT4_C2B(sbi, lclu), depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } extent = path[depth].p_ext; /* can't be mapped if the extent tree is empty */ if (extent == NULL) goto out; first_lblk = le32_to_cpu(extent->ee_block); first_lclu = EXT4_B2C(sbi, first_lblk); /* * Three possible outcomes at this point - found extent spanning * the target cluster, to the left of the target cluster, or to the * right of the target cluster. The first two cases are handled here. * The last case indicates the target cluster is not mapped. */ if (lclu >= first_lclu) { last_lclu = EXT4_B2C(sbi, first_lblk + ext4_ext_get_actual_len(extent) - 1); if (lclu <= last_lclu) { mapped = 1; } else { first_lblk = ext4_ext_next_allocated_block(path); first_lclu = EXT4_B2C(sbi, first_lblk); if (lclu == first_lclu) mapped = 1; } } out: ext4_free_ext_path(path); return err ? err : mapped; } /* * Updates physical block address and unwritten status of extent * starting at lblk start and of len. If such an extent doesn't exist, * this function splits the extent tree appropriately to create an * extent like this. This function is called in the fast commit * replay path. Returns 0 on success and error on failure. */ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { struct ext4_ext_path *path; struct ext4_extent *ex; int ret; path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; goto out; } if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_force_split_extent_at(NULL, inode, path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); path = ext4_force_split_extent_at(NULL, inode, path, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; } } if (unwritten) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); ext4_ext_store_pblock(ex, pblk); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } /* Try to shrink the extent tree */ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t old_cur, cur = 0; while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) return; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } old_cur = cur; cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (cur <= old_cur) cur = old_cur + 1; ext4_ext_try_to_merge(NULL, inode, path, ex); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); ext4_free_ext_path(path); } } /* Check if *cur is a hole and if it is, skip it */ static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; map.m_lblk = *cur; map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret != 0) return 0; *cur = *cur + map.m_len; return 0; } /* Count number of blocks used by this inode and update i_blocks */ int ext4_ext_replay_set_iblocks(struct inode *inode) { struct ext4_ext_path *path = NULL, *path2 = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int numblks = 0, i, ret = 0; ext4_fsblk_t cmp1, cmp2; struct ext4_map_blocks map; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); /* Count the number of data blocks */ cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) numblks += ret; cur = cur + map.m_len; } /* * Count the number of extent tree blocks. We do it by looking up * two successive extents and determining the difference between * their paths. When path is different for 2 successive extents * we compare the blocks in the path at each level and increment * iblocks by total number of differences found. */ cur = 0; ret = skip_hole(inode, &cur); if (ret < 0) goto out; path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; while (cur < end) { path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) goto cleanup; cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) break; path2 = ext4_find_extent(inode, cur, path2, 0); if (IS_ERR(path2)) break; for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) cmp1 = path[i].p_bh ? path[i].p_bh->b_blocknr : 0; if (i <= path2->p_depth) cmp2 = path2[i].p_bh ? path2[i].p_bh->b_blocknr : 0; if (cmp1 != cmp2 && cmp2 != 0) numblks++; } } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); cleanup: ext4_free_ext_path(path); ext4_free_ext_path(path2); return 0; } int ext4_ext_clear_bb(struct inode *inode) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int j, ret = 0; struct ext4_map_blocks map; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) return 0; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } } else { path = NULL; } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } out: ext4_free_ext_path(path); return 0; } |
| 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 | // SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics TCP target. * Copyright (c) 2018 Lightbits Labs. All rights reserved. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/crc32c.h> #include <linux/err.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> #include <net/tls_prot.h> #include <net/handshake.h> #include <linux/inet.h> #include <linux/llist.h> #include <trace/events/sock.h> #include "nvmet.h" #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) #define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */ #define NVMET_TCP_BACKLOG 128 static int param_store_val(const char *str, int *val, int min, int max) { int ret, new_val; ret = kstrtoint(str, 10, &new_val); if (ret) return -EINVAL; if (new_val < min || new_val > max) return -EINVAL; *val = new_val; return 0; } static int set_params(const char *str, const struct kernel_param *kp) { return param_store_val(str, kp->arg, 0, INT_MAX); } static const struct kernel_param_ops set_param_ops = { .set = set_params, .get = param_get_int, }; /* Define the socket priority to use for connections were it is desirable * that the NIC consider performing optimized packet processing or filtering. * A non-zero value being sufficient to indicate general consideration of any * possible optimization. Making it a module param allows for alternative * values that may be unique for some NIC implementations. */ static int so_priority; device_param_cb(so_priority, &set_param_ops, &so_priority, 0644); MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority: Default 0"); /* Define a time period (in usecs) that io_work() shall sample an activated * queue before determining it to be idle. This optional module behavior * can enable NIC solutions that support socket optimized packet processing * using advanced interrupt moderation techniques. */ static int idle_poll_period_usecs; device_param_cb(idle_poll_period_usecs, &set_param_ops, &idle_poll_period_usecs, 0644); MODULE_PARM_DESC(idle_poll_period_usecs, "nvmet tcp io_work poll till idle time period in usecs: Default 0"); #ifdef CONFIG_NVME_TARGET_TCP_TLS /* * TLS handshake timeout */ static int tls_handshake_timeout = 10; module_param(tls_handshake_timeout, int, 0644); MODULE_PARM_DESC(tls_handshake_timeout, "nvme TLS handshake timeout in seconds (default 10)"); #endif #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 enum nvmet_tcp_send_state { NVMET_TCP_SEND_DATA_PDU, NVMET_TCP_SEND_DATA, NVMET_TCP_SEND_R2T, NVMET_TCP_SEND_DDGST, NVMET_TCP_SEND_RESPONSE }; enum nvmet_tcp_recv_state { NVMET_TCP_RECV_PDU, NVMET_TCP_RECV_DATA, NVMET_TCP_RECV_DDGST, NVMET_TCP_RECV_ERR, }; enum { NVMET_TCP_F_INIT_FAILED = (1 << 0), }; struct nvmet_tcp_cmd { struct nvmet_tcp_queue *queue; struct nvmet_req req; struct nvme_tcp_cmd_pdu *cmd_pdu; struct nvme_tcp_rsp_pdu *rsp_pdu; struct nvme_tcp_data_pdu *data_pdu; struct nvme_tcp_r2t_pdu *r2t_pdu; u32 rbytes_done; u32 wbytes_done; u32 pdu_len; u32 pdu_recv; int sg_idx; char recv_cbuf[CMSG_LEN(sizeof(char))]; struct msghdr recv_msg; struct bio_vec *iov; u32 flags; struct list_head entry; struct llist_node lentry; /* send state */ u32 offset; struct scatterlist *cur_sg; enum nvmet_tcp_send_state state; __le32 exp_ddgst; __le32 recv_ddgst; }; enum nvmet_tcp_queue_state { NVMET_TCP_Q_CONNECTING, NVMET_TCP_Q_TLS_HANDSHAKE, NVMET_TCP_Q_LIVE, NVMET_TCP_Q_DISCONNECTING, NVMET_TCP_Q_FAILED, }; struct nvmet_tcp_queue { struct socket *sock; struct nvmet_tcp_port *port; struct work_struct io_work; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct kref kref; /* send state */ struct nvmet_tcp_cmd *cmds; unsigned int nr_cmds; struct list_head free_list; struct llist_head resp_list; struct list_head resp_send_list; int send_list_len; struct nvmet_tcp_cmd *snd_cmd; /* recv state */ int offset; int left; enum nvmet_tcp_recv_state rcv_state; struct nvmet_tcp_cmd *cmd; union nvme_tcp_pdu pdu; /* digest state */ bool hdr_digest; bool data_digest; /* TLS state */ key_serial_t tls_pskid; struct delayed_work tls_handshake_tmo_work; unsigned long poll_end; spinlock_t state_lock; enum nvmet_tcp_queue_state state; struct sockaddr_storage sockaddr; struct sockaddr_storage sockaddr_peer; struct work_struct release_work; int idx; struct list_head queue_list; struct nvmet_tcp_cmd connect; struct page_frag_cache pf_cache; void (*data_ready)(struct sock *); void (*state_change)(struct sock *); void (*write_space)(struct sock *); }; struct nvmet_tcp_port { struct socket *sock; struct work_struct accept_work; struct nvmet_port *nport; struct sockaddr_storage addr; void (*data_ready)(struct sock *); }; static DEFINE_IDA(nvmet_tcp_queue_ida); static LIST_HEAD(nvmet_tcp_queue_list); static DEFINE_MUTEX(nvmet_tcp_queue_mutex); static struct workqueue_struct *nvmet_tcp_wq; static const struct nvmet_fabrics_ops nvmet_tcp_ops; static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) { if (unlikely(!queue->nr_cmds)) { /* We didn't allocate cmds yet, send 0xffff */ return USHRT_MAX; } return cmd - queue->cmds; } static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) { return nvme_is_write(cmd->req.cmd) && cmd->rbytes_done < cmd->req.transfer_len; } static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) { return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status; } static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) { return !nvme_is_write(cmd->req.cmd) && cmd->req.transfer_len > 0 && !cmd->req.cqe->status; } static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) { return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && !cmd->rbytes_done; } static inline struct nvmet_tcp_cmd * nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd; cmd = list_first_entry_or_null(&queue->free_list, struct nvmet_tcp_cmd, entry); if (!cmd) return NULL; list_del_init(&cmd->entry); cmd->rbytes_done = cmd->wbytes_done = 0; cmd->pdu_len = 0; cmd->pdu_recv = 0; cmd->iov = NULL; cmd->flags = 0; return cmd; } static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) { if (unlikely(cmd == &cmd->queue->connect)) return; list_add_tail(&cmd->entry, &cmd->queue->free_list); } static inline int queue_cpu(struct nvmet_tcp_queue *queue) { return queue->sock->sk->sk_incoming_cpu; } static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) { return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; } static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) { return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; } static inline void nvmet_tcp_hdgst(void *pdu, size_t len) { put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); } static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, void *pdu, size_t len) { struct nvme_tcp_hdr *hdr = pdu; __le32 recv_digest; __le32 exp_digest; if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { pr_err("queue %d: header digest enabled but no header digest\n", queue->idx); return -EPROTO; } recv_digest = *(__le32 *)(pdu + hdr->hlen); nvmet_tcp_hdgst(pdu, len); exp_digest = *(__le32 *)(pdu + hdr->hlen); if (recv_digest != exp_digest) { pr_err("queue %d: header digest error: recv %#x expected %#x\n", queue->idx, le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); return -EPROTO; } return 0; } static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) { struct nvme_tcp_hdr *hdr = pdu; u8 digest_len = nvmet_tcp_hdgst_len(queue); u32 len; len = le32_to_cpu(hdr->plen) - hdr->hlen - (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0); if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { pr_err("queue %d: data digest flag is cleared\n", queue->idx); return -EPROTO; } return 0; } /* If cmd buffers are NULL, no operation is performed */ static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) { kfree(cmd->iov); sgl_free(cmd->req.sg); cmd->iov = NULL; cmd->req.sg = NULL; } static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) { struct bio_vec *iov = cmd->iov; struct scatterlist *sg; u32 length, offset, sg_offset; int nr_pages; length = cmd->pdu_len; nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); offset = cmd->rbytes_done; cmd->sg_idx = offset / PAGE_SIZE; sg_offset = offset % PAGE_SIZE; sg = &cmd->req.sg[cmd->sg_idx]; while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); bvec_set_page(iov, sg_page(sg), iov_len, sg->offset + sg_offset); length -= iov_len; sg = sg_next(sg); iov++; sg_offset = 0; } iov_iter_bvec(&cmd->recv_msg.msg_iter, ITER_DEST, cmd->iov, nr_pages, cmd->pdu_len); } static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) { queue->rcv_state = NVMET_TCP_RECV_ERR; if (queue->nvme_sq.ctrl) nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); else kernel_sock_shutdown(queue->sock, SHUT_RDWR); } static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { queue->rcv_state = NVMET_TCP_RECV_ERR; if (status == -EPIPE || status == -ECONNRESET) kernel_sock_shutdown(queue->sock, SHUT_RDWR); else nvmet_tcp_fatal_error(queue); } static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) { struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; u32 len = le32_to_cpu(sgl->length); if (!len) return 0; if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET)) { if (!nvme_is_write(cmd->req.cmd)) return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; if (len > cmd->req.port->inline_data_size) return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; cmd->pdu_len = len; } cmd->req.transfer_len += len; cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); if (!cmd->req.sg) return NVME_SC_INTERNAL; cmd->cur_sg = cmd->req.sg; if (nvmet_tcp_has_data_in(cmd)) { cmd->iov = kmalloc_array(cmd->req.sg_cnt, sizeof(*cmd->iov), GFP_KERNEL); if (!cmd->iov) goto err; } return 0; err: nvmet_tcp_free_cmd_buffers(cmd); return NVME_SC_INTERNAL; } static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd) { size_t total_len = cmd->req.transfer_len; struct scatterlist *sg = cmd->req.sg; u32 crc = ~0; while (total_len) { size_t len = min_t(size_t, total_len, sg->length); /* * Note that the scatterlist does not contain any highmem pages, * as it was allocated by sgl_alloc() with GFP_KERNEL. */ crc = crc32c(crc, sg_virt(sg), len); total_len -= len; sg = sg_next(sg); } cmd->exp_ddgst = cpu_to_le32(~crc); } static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; struct nvmet_tcp_queue *queue = cmd->queue; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); cmd->offset = 0; cmd->state = NVMET_TCP_SEND_DATA_PDU; pdu->hdr.type = nvme_tcp_c2h_data; pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ? NVME_TCP_F_DATA_SUCCESS : 0); pdu->hdr.hlen = sizeof(*pdu); pdu->hdr.pdo = pdu->hdr.hlen + hdgst; pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst + cmd->req.transfer_len + ddgst); pdu->command_id = cmd->req.cqe->command_id; pdu->data_length = cpu_to_le32(cmd->req.transfer_len); pdu->data_offset = cpu_to_le32(cmd->wbytes_done); if (queue->data_digest) { pdu->hdr.flags |= NVME_TCP_F_DDGST; nvmet_tcp_calc_ddgst(cmd); } if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; cmd->state = NVMET_TCP_SEND_R2T; pdu->hdr.type = nvme_tcp_r2t; pdu->hdr.flags = 0; pdu->hdr.hlen = sizeof(*pdu); pdu->hdr.pdo = 0; pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); pdu->command_id = cmd->req.cmd->common.command_id; pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; cmd->state = NVMET_TCP_SEND_RESPONSE; pdu->hdr.type = nvme_tcp_rsp; pdu->hdr.flags = 0; pdu->hdr.hlen = sizeof(*pdu); pdu->hdr.pdo = 0; pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) { struct llist_node *node; struct nvmet_tcp_cmd *cmd; for (node = llist_del_all(&queue->resp_list); node; node = node->next) { cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); list_add(&cmd->entry, &queue->resp_send_list); queue->send_list_len++; } } static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) { queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, struct nvmet_tcp_cmd, entry); if (!queue->snd_cmd) { nvmet_tcp_process_resp_list(queue); queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, struct nvmet_tcp_cmd, entry); if (unlikely(!queue->snd_cmd)) return NULL; } list_del_init(&queue->snd_cmd->entry); queue->send_list_len--; if (nvmet_tcp_need_data_out(queue->snd_cmd)) nvmet_setup_c2h_data_pdu(queue->snd_cmd); else if (nvmet_tcp_need_data_in(queue->snd_cmd)) nvmet_setup_r2t_pdu(queue->snd_cmd); else nvmet_setup_response_pdu(queue->snd_cmd); return queue->snd_cmd; } static void nvmet_tcp_queue_response(struct nvmet_req *req) { struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; enum nvmet_tcp_recv_state queue_state; struct nvmet_tcp_cmd *queue_cmd; struct nvme_sgl_desc *sgl; u32 len; /* Pairs with store_release in nvmet_prepare_receive_pdu() */ queue_state = smp_load_acquire(&queue->rcv_state); queue_cmd = READ_ONCE(queue->cmd); if (unlikely(cmd == queue_cmd)) { sgl = &cmd->req.cmd->common.dptr.sgl; len = le32_to_cpu(sgl->length); /* * Wait for inline data before processing the response. * Avoid using helpers, this might happen before * nvmet_req_init is completed. */ if (queue_state == NVMET_TCP_RECV_PDU && len && len <= cmd->req.port->inline_data_size && nvme_is_write(cmd->req.cmd)) return; } llist_add(&cmd->lentry, &queue->resp_list); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); } static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) { if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED)) nvmet_tcp_queue_response(&cmd->req); else cmd->req.execute(&cmd->req); } static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES, }; struct bio_vec bvec; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; int ret; bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left); ret = sock_sendmsg(cmd->queue->sock, &msg); if (ret <= 0) return ret; cmd->offset += ret; left -= ret; if (left) return -EAGAIN; cmd->state = NVMET_TCP_SEND_DATA; cmd->offset = 0; return 1; } static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct nvmet_tcp_queue *queue = cmd->queue; int ret; while (cmd->cur_sg) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct page *page = sg_page(cmd->cur_sg); struct bio_vec bvec; u32 left = cmd->cur_sg->length - cmd->offset; if ((!last_in_batch && cmd->queue->send_list_len) || cmd->wbytes_done + left < cmd->req.transfer_len || queue->data_digest || !queue->nvme_sq.sqhd_disabled) msg.msg_flags |= MSG_MORE; bvec_set_page(&bvec, page, left, cmd->offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left); ret = sock_sendmsg(cmd->queue->sock, &msg); if (ret <= 0) return ret; cmd->offset += ret; cmd->wbytes_done += ret; /* Done with sg?*/ if (cmd->offset == cmd->cur_sg->length) { cmd->cur_sg = sg_next(cmd->cur_sg); cmd->offset = 0; } } if (queue->data_digest) { cmd->state = NVMET_TCP_SEND_DDGST; cmd->offset = 0; } else { if (queue->nvme_sq.sqhd_disabled) { cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); } else { nvmet_setup_response_pdu(cmd); } } if (queue->nvme_sq.sqhd_disabled) nvmet_tcp_free_cmd_buffers(cmd); return 1; } static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct bio_vec bvec; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; int ret; if (!last_in_batch && cmd->queue->send_list_len) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left); ret = sock_sendmsg(cmd->queue->sock, &msg); if (ret <= 0) return ret; cmd->offset += ret; left -= ret; if (left) return -EAGAIN; nvmet_tcp_free_cmd_buffers(cmd); cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); return 1; } static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct bio_vec bvec; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; int ret; if (!last_in_batch && cmd->queue->send_list_len) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left); ret = sock_sendmsg(cmd->queue->sock, &msg); if (ret <= 0) return ret; cmd->offset += ret; left -= ret; if (left) return -EAGAIN; cmd->queue->snd_cmd = NULL; return 1; } static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct nvmet_tcp_queue *queue = cmd->queue; int left = NVME_TCP_DIGEST_LENGTH - cmd->offset; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, .iov_len = left }; int ret; if (!last_in_batch && cmd->queue->send_list_len) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; cmd->offset += ret; left -= ret; if (left) return -EAGAIN; if (queue->nvme_sq.sqhd_disabled) { cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); } else { nvmet_setup_response_pdu(cmd); } return 1; } static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, bool last_in_batch) { struct nvmet_tcp_cmd *cmd = queue->snd_cmd; int ret = 0; if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { cmd = nvmet_tcp_fetch_cmd(queue); if (unlikely(!cmd)) return 0; } if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { ret = nvmet_try_send_data_pdu(cmd); if (ret <= 0) goto done_send; } if (cmd->state == NVMET_TCP_SEND_DATA) { ret = nvmet_try_send_data(cmd, last_in_batch); if (ret <= 0) goto done_send; } if (cmd->state == NVMET_TCP_SEND_DDGST) { ret = nvmet_try_send_ddgst(cmd, last_in_batch); if (ret <= 0) goto done_send; } if (cmd->state == NVMET_TCP_SEND_R2T) { ret = nvmet_try_send_r2t(cmd, last_in_batch); if (ret <= 0) goto done_send; } if (cmd->state == NVMET_TCP_SEND_RESPONSE) ret = nvmet_try_send_response(cmd, last_in_batch); done_send: if (ret < 0) { if (ret == -EAGAIN) return 0; return ret; } return 1; } static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, int budget, int *sends) { int i, ret = 0; for (i = 0; i < budget; i++) { ret = nvmet_tcp_try_send_one(queue, i == budget - 1); if (unlikely(ret < 0)) { nvmet_tcp_socket_error(queue, ret); goto done; } else if (ret == 0) { break; } (*sends)++; } done: return ret; } static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) { queue->offset = 0; queue->left = sizeof(struct nvme_tcp_hdr); WRITE_ONCE(queue->cmd, NULL); /* Ensure rcv_state is visible only after queue->cmd is set */ smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); } static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; struct msghdr msg = {}; struct kvec iov; int ret; if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { pr_err("bad nvme-tcp pdu length (%d)\n", le32_to_cpu(icreq->hdr.plen)); nvmet_tcp_fatal_error(queue); return -EPROTO; } if (icreq->pfv != NVME_TCP_PFV_1_0) { pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); return -EPROTO; } if (icreq->hpda != 0) { pr_err("queue %d: unsupported hpda %d\n", queue->idx, icreq->hpda); return -EPROTO; } queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); memset(icresp, 0, sizeof(*icresp)); icresp->hdr.type = nvme_tcp_icresp; icresp->hdr.hlen = sizeof(*icresp); icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA); icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; if (queue->data_digest) icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (ret < 0) { queue->state = NVMET_TCP_Q_FAILED; return ret; /* queue removal will cleanup */ } queue->state = NVMET_TCP_Q_LIVE; nvmet_prepare_receive_pdu(queue); return 0; } static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) { size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); int ret; /* * This command has not been processed yet, hence we are trying to * figure out if there is still pending data left to receive. If * we don't, we can simply prepare for the next pdu and bail out, * otherwise we will need to prepare a buffer and receive the * stale data before continuing forward. */ if (!nvme_is_write(cmd->req.cmd) || !data_len || data_len > cmd->req.port->inline_data_size) { nvmet_prepare_receive_pdu(queue); return; } ret = nvmet_tcp_map_data(cmd); if (unlikely(ret)) { pr_err("queue %d: failed to map data\n", queue->idx); nvmet_tcp_fatal_error(queue); return; } queue->rcv_state = NVMET_TCP_RECV_DATA; nvmet_tcp_build_pdu_iovec(cmd); cmd->flags |= NVMET_TCP_F_INIT_FAILED; } static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; unsigned int exp_data_len; if (likely(queue->nr_cmds)) { if (unlikely(data->ttag >= queue->nr_cmds)) { pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", queue->idx, data->ttag, queue->nr_cmds); goto err_proto; } cmd = &queue->cmds[data->ttag]; } else { cmd = &queue->connect; } if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { pr_err("ttag %u unexpected data offset %u (expected %u)\n", data->ttag, le32_to_cpu(data->data_offset), cmd->rbytes_done); goto err_proto; } exp_data_len = le32_to_cpu(data->hdr.plen) - nvmet_tcp_hdgst_len(queue) - nvmet_tcp_ddgst_len(queue) - sizeof(*data); cmd->pdu_len = le32_to_cpu(data->data_length); if (unlikely(cmd->pdu_len != exp_data_len || cmd->pdu_len == 0 || cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); goto err_proto; } /* * Ensure command data structures are initialized. We must check both * cmd->req.sg and cmd->iov because they can have different NULL states: * - Uninitialized commands: both NULL * - READ commands: cmd->req.sg allocated, cmd->iov NULL * - WRITE commands: both allocated */ if (unlikely(!cmd->req.sg || !cmd->iov)) { pr_err("queue %d: H2CData PDU received for invalid command state (ttag %u)\n", queue->idx, data->ttag); goto err_proto; } cmd->pdu_recv = 0; nvmet_tcp_build_pdu_iovec(cmd); queue->cmd = cmd; queue->rcv_state = NVMET_TCP_RECV_DATA; return 0; err_proto: /* FIXME: use proper transport errors */ nvmet_tcp_fatal_error(queue); return -EPROTO; } static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; struct nvmet_req *req; int ret; if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { if (hdr->type != nvme_tcp_icreq) { pr_err("unexpected pdu type (%d) before icreq\n", hdr->type); nvmet_tcp_fatal_error(queue); return -EPROTO; } return nvmet_tcp_handle_icreq(queue); } if (unlikely(hdr->type == nvme_tcp_icreq)) { pr_err("queue %d: received icreq pdu in state %d\n", queue->idx, queue->state); nvmet_tcp_fatal_error(queue); return -EPROTO; } if (hdr->type == nvme_tcp_h2c_data) { ret = nvmet_tcp_handle_h2c_data_pdu(queue); if (unlikely(ret)) return ret; return 0; } queue->cmd = nvmet_tcp_get_cmd(queue); if (unlikely(!queue->cmd)) { /* This should never happen */ pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", queue->idx, queue->nr_cmds, queue->send_list_len, nvme_cmd->common.opcode); nvmet_tcp_fatal_error(queue); return -ENOMEM; } req = &queue->cmd->req; memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) { pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, le32_to_cpu(req->cmd->common.dptr.sgl.length), le16_to_cpu(req->cqe->status)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); return 0; } ret = nvmet_tcp_map_data(queue->cmd); if (unlikely(ret)) { pr_err("queue %d: failed to map data\n", queue->idx); if (nvmet_tcp_has_inline_data(queue->cmd)) nvmet_tcp_fatal_error(queue); else nvmet_req_complete(req, ret); ret = -EAGAIN; goto out; } if (nvmet_tcp_need_data_in(queue->cmd)) { if (nvmet_tcp_has_inline_data(queue->cmd)) { queue->rcv_state = NVMET_TCP_RECV_DATA; nvmet_tcp_build_pdu_iovec(queue->cmd); return 0; } /* send back R2T */ nvmet_tcp_queue_response(&queue->cmd->req); goto out; } queue->cmd->req.execute(&queue->cmd->req); out: nvmet_prepare_receive_pdu(queue); return ret; } static const u8 nvme_tcp_pdu_sizes[] = { [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), }; static inline u8 nvmet_tcp_pdu_size(u8 type) { size_t idx = type; return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && nvme_tcp_pdu_sizes[idx]) ? nvme_tcp_pdu_sizes[idx] : 0; } static inline bool nvmet_tcp_pdu_valid(u8 type) { switch (type) { case nvme_tcp_icreq: case nvme_tcp_cmd: case nvme_tcp_h2c_data: /* fallthru */ return true; } return false; } static int nvmet_tcp_tls_record_ok(struct nvmet_tcp_queue *queue, struct msghdr *msg, char *cbuf) { struct cmsghdr *cmsg = (struct cmsghdr *)cbuf; u8 ctype, level, description; int ret = 0; ctype = tls_get_record_type(queue->sock->sk, cmsg); switch (ctype) { case 0: break; case TLS_RECORD_TYPE_DATA: break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(queue->sock->sk, msg, &level, &description); if (level == TLS_ALERT_LEVEL_FATAL) { pr_err("queue %d: TLS Alert desc %u\n", queue->idx, description); ret = -ENOTCONN; } else { pr_warn("queue %d: TLS Alert desc %u\n", queue->idx, description); ret = -EAGAIN; } break; default: /* discard this record type */ pr_err("queue %d: TLS record %d unhandled\n", queue->idx, ctype); ret = -EAGAIN; break; } return ret; } static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; int len, ret; struct kvec iov; char cbuf[CMSG_LEN(sizeof(char))] = {}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; recv: iov.iov_base = (void *)&queue->pdu + queue->offset; iov.iov_len = queue->left; if (queue->tls_pskid) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } len = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (unlikely(len < 0)) return len; if (queue->tls_pskid) { ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret; } queue->offset += len; queue->left -= len; if (queue->left) return -EAGAIN; if (queue->offset == sizeof(struct nvme_tcp_hdr)) { u8 hdgst = nvmet_tcp_hdgst_len(queue); if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { pr_err("unexpected pdu type %d\n", hdr->type); nvmet_tcp_fatal_error(queue); return -EIO; } if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); return -EIO; } queue->left = hdr->hlen - queue->offset + hdgst; goto recv; } if (queue->hdr_digest && nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { nvmet_tcp_fatal_error(queue); /* fatal */ return -EPROTO; } if (queue->data_digest && nvmet_tcp_check_ddgst(queue, &queue->pdu)) { nvmet_tcp_fatal_error(queue); /* fatal */ return -EPROTO; } return nvmet_tcp_done_recv_pdu(queue); } static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) { struct nvmet_tcp_queue *queue = cmd->queue; nvmet_tcp_calc_ddgst(cmd); queue->offset = 0; queue->left = NVME_TCP_DIGEST_LENGTH; queue->rcv_state = NVMET_TCP_RECV_DDGST; } static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; int len, ret; while (msg_data_left(&cmd->recv_msg)) { len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, cmd->recv_msg.msg_flags); if (len <= 0) return len; if (queue->tls_pskid) { ret = nvmet_tcp_tls_record_ok(cmd->queue, &cmd->recv_msg, cmd->recv_cbuf); if (ret < 0) return ret; } cmd->pdu_recv += len; cmd->rbytes_done += len; } if (queue->data_digest) { nvmet_tcp_prep_recv_ddgst(cmd); return 0; } if (cmd->rbytes_done == cmd->req.transfer_len) nvmet_tcp_execute_request(cmd); nvmet_prepare_receive_pdu(queue); return 0; } static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; int ret, len; char cbuf[CMSG_LEN(sizeof(char))] = {}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = (void *)&cmd->recv_ddgst + queue->offset, .iov_len = queue->left }; if (queue->tls_pskid) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } len = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (unlikely(len < 0)) return len; if (queue->tls_pskid) { ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret; } queue->offset += len; queue->left -= len; if (queue->left) return -EAGAIN; if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", queue->idx, cmd->req.cmd->common.command_id, queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), le32_to_cpu(cmd->exp_ddgst)); nvmet_req_uninit(&cmd->req); nvmet_tcp_free_cmd_buffers(cmd); nvmet_tcp_fatal_error(queue); ret = -EPROTO; goto out; } if (cmd->rbytes_done == cmd->req.transfer_len) nvmet_tcp_execute_request(cmd); ret = 0; out: nvmet_prepare_receive_pdu(queue); return ret; } static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) { int result = 0; if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) return 0; if (queue->rcv_state == NVMET_TCP_RECV_PDU) { result = nvmet_tcp_try_recv_pdu(queue); if (result != 0) goto done_recv; } if (queue->rcv_state == NVMET_TCP_RECV_DATA) { result = nvmet_tcp_try_recv_data(queue); if (result != 0) goto done_recv; } if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { result = nvmet_tcp_try_recv_ddgst(queue); if (result != 0) goto done_recv; } done_recv: if (result < 0) { if (result == -EAGAIN) return 0; return result; } return 1; } static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, int budget, int *recvs) { int i, ret = 0; for (i = 0; i < budget; i++) { ret = nvmet_tcp_try_recv_one(queue); if (unlikely(ret < 0)) { nvmet_tcp_socket_error(queue, ret); goto done; } else if (ret == 0) { break; } (*recvs)++; } done: return ret; } static void nvmet_tcp_release_queue(struct kref *kref) { struct nvmet_tcp_queue *queue = container_of(kref, struct nvmet_tcp_queue, kref); WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING); queue_work(nvmet_wq, &queue->release_work); } static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) { spin_lock_bh(&queue->state_lock); if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { /* Socket closed during handshake */ tls_handshake_cancel(queue->sock->sk); } if (queue->state != NVMET_TCP_Q_DISCONNECTING) { queue->state = NVMET_TCP_Q_DISCONNECTING; kref_put(&queue->kref, nvmet_tcp_release_queue); } spin_unlock_bh(&queue->state_lock); } static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) { queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs); } static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue, int ops) { if (!idle_poll_period_usecs) return false; if (ops) nvmet_tcp_arm_queue_deadline(queue); return !time_after(jiffies, queue->poll_end); } static void nvmet_tcp_io_work(struct work_struct *w) { struct nvmet_tcp_queue *queue = container_of(w, struct nvmet_tcp_queue, io_work); bool pending; int ret, ops = 0; do { pending = false; ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); if (ret > 0) pending = true; else if (ret < 0) return; ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); if (ret > 0) pending = true; else if (ret < 0) return; } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); /* * Requeue the worker if idle deadline period is in progress or any * ops activity was recorded during the do-while loop above. */ if (nvmet_tcp_check_queue_deadline(queue, ops) || pending) queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *c) { u8 hdgst = nvmet_tcp_hdgst_len(queue); c->queue = queue; c->req.port = queue->port->nport; c->cmd_pdu = page_frag_alloc(&queue->pf_cache, sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); if (!c->cmd_pdu) return -ENOMEM; c->req.cmd = &c->cmd_pdu->cmd; c->rsp_pdu = page_frag_alloc(&queue->pf_cache, sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); if (!c->rsp_pdu) goto out_free_cmd; c->req.cqe = &c->rsp_pdu->cqe; c->data_pdu = page_frag_alloc(&queue->pf_cache, sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); if (!c->data_pdu) goto out_free_rsp; c->r2t_pdu = page_frag_alloc(&queue->pf_cache, sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); if (!c->r2t_pdu) goto out_free_data; if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { c->recv_msg.msg_control = c->recv_cbuf; c->recv_msg.msg_controllen = sizeof(c->recv_cbuf); } c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; list_add_tail(&c->entry, &queue->free_list); return 0; out_free_data: page_frag_free(c->data_pdu); out_free_rsp: page_frag_free(c->rsp_pdu); out_free_cmd: page_frag_free(c->cmd_pdu); return -ENOMEM; } static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) { page_frag_free(c->r2t_pdu); page_frag_free(c->data_pdu); page_frag_free(c->rsp_pdu); page_frag_free(c->cmd_pdu); } static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmds; int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; cmds = kvcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); if (!cmds) goto out; for (i = 0; i < nr_cmds; i++) { ret = nvmet_tcp_alloc_cmd(queue, cmds + i); if (ret) goto out_free; } queue->cmds = cmds; return 0; out_free: while (--i >= 0) nvmet_tcp_free_cmd(cmds + i); kvfree(cmds); out: return ret; } static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmds = queue->cmds; int i; for (i = 0; i < queue->nr_cmds; i++) nvmet_tcp_free_cmd(cmds + i); nvmet_tcp_free_cmd(&queue->connect); kvfree(cmds); } static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; if (!queue->state_change) return; write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_data_ready = queue->data_ready; sock->sk->sk_state_change = queue->state_change; sock->sk->sk_write_space = queue->write_space; sock->sk->sk_user_data = NULL; write_unlock_bh(&sock->sk->sk_callback_lock); } static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmds; int i; for (i = 0; i < queue->nr_cmds; i++, cmd++) { if (nvmet_tcp_need_data_in(cmd)) nvmet_req_uninit(&cmd->req); } if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { /* failed in connect */ nvmet_req_uninit(&queue->connect.req); } } static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmds; int i; for (i = 0; i < queue->nr_cmds; i++, cmd++) nvmet_tcp_free_cmd_buffers(cmd); nvmet_tcp_free_cmd_buffers(&queue->connect); } static void nvmet_tcp_release_queue_work(struct work_struct *w) { struct nvmet_tcp_queue *queue = container_of(w, struct nvmet_tcp_queue, release_work); mutex_lock(&nvmet_tcp_queue_mutex); list_del_init(&queue->queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); nvmet_tcp_restore_socket_callbacks(queue); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); cancel_work_sync(&queue->io_work); /* stop accepting incoming data */ queue->rcv_state = NVMET_TCP_RECV_ERR; nvmet_sq_put_tls_key(&queue->nvme_sq); nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); nvmet_cq_put(&queue->nvme_cq); cancel_work_sync(&queue->io_work); nvmet_tcp_free_cmd_data_in_buffers(queue); /* ->sock will be released by fput() */ fput(queue->sock->file); nvmet_tcp_free_cmds(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); page_frag_cache_drain(&queue->pf_cache); kfree(queue); } static void nvmet_tcp_data_ready(struct sock *sk) { struct nvmet_tcp_queue *queue; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (likely(queue)) { if (queue->data_ready) queue->data_ready(sk); if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } read_unlock_bh(&sk->sk_callback_lock); } static void nvmet_tcp_write_space(struct sock *sk) { struct nvmet_tcp_queue *queue; read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (unlikely(!queue)) goto out; if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { queue->write_space(sk); goto out; } if (sk_stream_is_writeable(sk)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } out: read_unlock_bh(&sk->sk_callback_lock); } static void nvmet_tcp_state_change(struct sock *sk) { struct nvmet_tcp_queue *queue; read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; switch (sk->sk_state) { case TCP_FIN_WAIT2: case TCP_LAST_ACK: break; case TCP_FIN_WAIT1: case TCP_CLOSE_WAIT: case TCP_CLOSE: /* FALLTHRU */ nvmet_tcp_schedule_release_queue(queue); break; default: pr_warn("queue %d unhandled state %d\n", queue->idx, sk->sk_state); } done: read_unlock_bh(&sk->sk_callback_lock); } static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); int ret; ret = kernel_getsockname(sock, (struct sockaddr *)&queue->sockaddr); if (ret < 0) return ret; ret = kernel_getpeername(sock, (struct sockaddr *)&queue->sockaddr_peer); if (ret < 0) return ret; /* * Cleanup whatever is sitting in the TCP transmit queue on socket * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ sock_no_linger(sock->sk); if (so_priority > 0) sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ if (inet->rcv_tos > 0) ip_sock_set_tos(sock->sk, inet->rcv_tos); ret = 0; write_lock_bh(&sock->sk->sk_callback_lock); if (sock->sk->sk_state != TCP_ESTABLISHED) { /* * If the socket is already closing, don't even start * consuming it */ ret = -ENOTCONN; } else { sock->sk->sk_user_data = queue; queue->data_ready = sock->sk->sk_data_ready; sock->sk->sk_data_ready = nvmet_tcp_data_ready; queue->state_change = sock->sk->sk_state_change; sock->sk->sk_state_change = nvmet_tcp_state_change; queue->write_space = sock->sk->sk_write_space; sock->sk->sk_write_space = nvmet_tcp_write_space; if (idle_poll_period_usecs) nvmet_tcp_arm_queue_deadline(queue); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } write_unlock_bh(&sock->sk->sk_callback_lock); return ret; } #ifdef CONFIG_NVME_TARGET_TCP_TLS static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; int len, ret; struct kvec iov = { .iov_base = (u8 *)&queue->pdu + queue->offset, .iov_len = sizeof(struct nvme_tcp_hdr), }; char cbuf[CMSG_LEN(sizeof(char))] = {}; struct msghdr msg = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf), .msg_flags = MSG_PEEK, }; if (nvmet_port_secure_channel_required(queue->port->nport)) return 0; len = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (unlikely(len < 0)) { pr_debug("queue %d: peek error %d\n", queue->idx, len); return len; } ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret; if (len < sizeof(struct nvme_tcp_hdr)) { pr_debug("queue %d: short read, %d bytes missing\n", queue->idx, (int)iov.iov_len - len); return -EAGAIN; } pr_debug("queue %d: hdr type %d hlen %d plen %d size %d\n", queue->idx, hdr->type, hdr->hlen, hdr->plen, (int)sizeof(struct nvme_tcp_icreq_pdu)); if (hdr->type == nvme_tcp_icreq && hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) && hdr->plen == cpu_to_le32(sizeof(struct nvme_tcp_icreq_pdu))) { pr_debug("queue %d: icreq detected\n", queue->idx); return len; } return 0; } static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue, key_serial_t peerid) { struct key *tls_key = nvme_tls_key_lookup(peerid); int status = 0; if (IS_ERR(tls_key)) { pr_warn("%s: queue %d failed to lookup key %x\n", __func__, queue->idx, peerid); spin_lock_bh(&queue->state_lock); queue->state = NVMET_TCP_Q_FAILED; spin_unlock_bh(&queue->state_lock); status = PTR_ERR(tls_key); } else { pr_debug("%s: queue %d using TLS PSK %x\n", __func__, queue->idx, peerid); queue->nvme_sq.tls_key = tls_key; } return status; } static void nvmet_tcp_tls_handshake_done(void *data, int status, key_serial_t peerid) { struct nvmet_tcp_queue *queue = data; pr_debug("queue %d: TLS handshake done, key %x, status %d\n", queue->idx, peerid, status); spin_lock_bh(&queue->state_lock); if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) { spin_unlock_bh(&queue->state_lock); return; } if (!status) { queue->tls_pskid = peerid; queue->state = NVMET_TCP_Q_CONNECTING; } else queue->state = NVMET_TCP_Q_FAILED; spin_unlock_bh(&queue->state_lock); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); if (!status) status = nvmet_tcp_tls_key_lookup(queue, peerid); if (status) nvmet_tcp_schedule_release_queue(queue); else nvmet_tcp_set_queue_sock(queue); kref_put(&queue->kref, nvmet_tcp_release_queue); } static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) { struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w), struct nvmet_tcp_queue, tls_handshake_tmo_work); pr_warn("queue %d: TLS handshake timeout\n", queue->idx); /* * If tls_handshake_cancel() fails we've lost the race with * nvmet_tcp_tls_handshake_done() */ if (!tls_handshake_cancel(queue->sock->sk)) return; spin_lock_bh(&queue->state_lock); if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) { spin_unlock_bh(&queue->state_lock); return; } queue->state = NVMET_TCP_Q_FAILED; spin_unlock_bh(&queue->state_lock); nvmet_tcp_schedule_release_queue(queue); kref_put(&queue->kref, nvmet_tcp_release_queue); } static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue) { int ret = -EOPNOTSUPP; struct tls_handshake_args args; if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) { pr_warn("cannot start TLS in state %d\n", queue->state); return -EINVAL; } kref_get(&queue->kref); pr_debug("queue %d: TLS ServerHello\n", queue->idx); memset(&args, 0, sizeof(args)); args.ta_sock = queue->sock; args.ta_done = nvmet_tcp_tls_handshake_done; args.ta_data = queue; args.ta_keyring = key_serial(queue->port->nport->keyring); args.ta_timeout_ms = tls_handshake_timeout * 1000; ret = tls_server_hello_psk(&args, GFP_KERNEL); if (ret) { kref_put(&queue->kref, nvmet_tcp_release_queue); pr_err("failed to start TLS, err=%d\n", ret); } else { queue_delayed_work(nvmet_wq, &queue->tls_handshake_tmo_work, tls_handshake_timeout * HZ); } return ret; } #else static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) {} #endif static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, struct socket *newsock) { struct nvmet_tcp_queue *queue; struct file *sock_file = NULL; int ret; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) { ret = -ENOMEM; goto out_release; } INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); INIT_WORK(&queue->io_work, nvmet_tcp_io_work); kref_init(&queue->kref); queue->sock = newsock; queue->port = port; queue->nr_cmds = 0; spin_lock_init(&queue->state_lock); if (queue->port->nport->disc_addr.tsas.tcp.sectype == NVMF_TCP_SECTYPE_TLS13) queue->state = NVMET_TCP_Q_TLS_HANDSHAKE; else queue->state = NVMET_TCP_Q_CONNECTING; INIT_LIST_HEAD(&queue->free_list); init_llist_head(&queue->resp_list); INIT_LIST_HEAD(&queue->resp_send_list); sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); if (IS_ERR(sock_file)) { ret = PTR_ERR(sock_file); goto out_free_queue; } queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = queue->idx; goto out_sock; } ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); if (ret) goto out_ida_remove; nvmet_cq_init(&queue->nvme_cq); ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) goto out_free_connect; nvmet_prepare_receive_pdu(queue); mutex_lock(&nvmet_tcp_queue_mutex); list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work, nvmet_tcp_tls_handshake_timeout); #ifdef CONFIG_NVME_TARGET_TCP_TLS if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { struct sock *sk = queue->sock->sk; /* Restore the default callbacks before starting upcall */ write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; sk->sk_data_ready = port->data_ready; write_unlock_bh(&sk->sk_callback_lock); if (!nvmet_tcp_try_peek_pdu(queue)) { if (!nvmet_tcp_tls_handshake(queue)) return; /* TLS handshake failed, terminate the connection */ goto out_destroy_sq; } /* Not a TLS connection, continue with normal processing */ queue->state = NVMET_TCP_Q_CONNECTING; } #endif ret = nvmet_tcp_set_queue_sock(queue); if (ret) goto out_destroy_sq; return; out_destroy_sq: mutex_lock(&nvmet_tcp_queue_mutex); list_del_init(&queue->queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); nvmet_sq_destroy(&queue->nvme_sq); out_free_connect: nvmet_cq_put(&queue->nvme_cq); nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: ida_free(&nvmet_tcp_queue_ida, queue->idx); out_sock: fput(queue->sock->file); out_free_queue: kfree(queue); out_release: pr_err("failed to allocate queue, error %d\n", ret); if (!sock_file) sock_release(newsock); } static void nvmet_tcp_accept_work(struct work_struct *w) { struct nvmet_tcp_port *port = container_of(w, struct nvmet_tcp_port, accept_work); struct socket *newsock; int ret; while (true) { ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); if (ret < 0) { if (ret != -EAGAIN) pr_warn("failed to accept err=%d\n", ret); return; } nvmet_tcp_alloc_queue(port, newsock); } } static void nvmet_tcp_listen_data_ready(struct sock *sk) { struct nvmet_tcp_port *port; trace_sk_data_ready(sk); if (sk->sk_state != TCP_LISTEN) return; read_lock_bh(&sk->sk_callback_lock); port = sk->sk_user_data; if (port) queue_work(nvmet_wq, &port->accept_work); read_unlock_bh(&sk->sk_callback_lock); } static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; switch (nport->disc_addr.adrfam) { case NVMF_ADDR_FAMILY_IP4: af = AF_INET; break; case NVMF_ADDR_FAMILY_IP6: af = AF_INET6; break; default: pr_err("address family %d not supported\n", nport->disc_addr.adrfam); ret = -EINVAL; goto err_port; } ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { pr_err("malformed ip/port passed: %s:%s\n", nport->disc_addr.traddr, nport->disc_addr.trsvcid); goto err_port; } port->nport = nport; INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); if (port->nport->inline_data_size < 0) port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; ret = sock_create(port->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &port->sock); if (ret) { pr_err("failed to create a socket\n"); goto err_port; } port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; sock_set_reuseaddr(port->sock->sk); tcp_sock_set_nodelay(port->sock->sk); if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr_unsized *)&port->addr, sizeof(port->addr)); if (ret) { pr_err("failed to bind port socket %d\n", ret); goto err_sock; } ret = kernel_listen(port->sock, NVMET_TCP_BACKLOG); if (ret) { pr_err("failed to listen %d on port sock\n", ret); goto err_sock; } nport->priv = port; pr_info("enabling port %d (%pISpc)\n", le16_to_cpu(nport->disc_addr.portid), &port->addr); return 0; err_sock: sock_release(port->sock); err_port: kfree(port); return ret; } static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port) { struct nvmet_tcp_queue *queue; mutex_lock(&nvmet_tcp_queue_mutex); list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) if (queue->port == port) kernel_sock_shutdown(queue->sock, SHUT_RDWR); mutex_unlock(&nvmet_tcp_queue_mutex); } static void nvmet_tcp_remove_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port = nport->priv; write_lock_bh(&port->sock->sk->sk_callback_lock); port->sock->sk->sk_data_ready = port->data_ready; port->sock->sk->sk_user_data = NULL; write_unlock_bh(&port->sock->sk->sk_callback_lock); cancel_work_sync(&port->accept_work); /* * Destroy the remaining queues, which are not belong to any * controller yet. */ nvmet_tcp_destroy_port_queues(port); sock_release(port->sock); kfree(port); } static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) { struct nvmet_tcp_queue *queue; mutex_lock(&nvmet_tcp_queue_mutex); list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) if (queue->nvme_sq.ctrl == ctrl) kernel_sock_shutdown(queue->sock, SHUT_RDWR); mutex_unlock(&nvmet_tcp_queue_mutex); } static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) { struct nvmet_tcp_queue *queue = container_of(sq, struct nvmet_tcp_queue, nvme_sq); if (sq->qid == 0) { struct nvmet_tcp_queue *q; int pending = 0; /* Check for pending controller teardown */ mutex_lock(&nvmet_tcp_queue_mutex); list_for_each_entry(q, &nvmet_tcp_queue_list, queue_list) { if (q->nvme_sq.ctrl == sq->ctrl && q->state == NVMET_TCP_Q_DISCONNECTING) pending++; } mutex_unlock(&nvmet_tcp_queue_mutex); if (pending > NVMET_TCP_BACKLOG) return NVME_SC_CONNECT_CTRL_BUSY; } queue->nr_cmds = sq->size * 2; if (nvmet_tcp_alloc_cmds(queue)) { queue->nr_cmds = 0; return NVME_SC_INTERNAL; } return 0; } static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, struct nvmet_port *nport, char *traddr) { struct nvmet_tcp_port *port = nport->priv; if (inet_addr_is_any(&port->addr)) { struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); } else { memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } static ssize_t nvmet_tcp_host_port_addr(struct nvmet_ctrl *ctrl, char *traddr, size_t traddr_len) { struct nvmet_sq *sq = ctrl->sqs[0]; struct nvmet_tcp_queue *queue = container_of(sq, struct nvmet_tcp_queue, nvme_sq); if (queue->sockaddr_peer.ss_family == AF_UNSPEC) return -EINVAL; return snprintf(traddr, traddr_len, "%pISc", (struct sockaddr *)&queue->sockaddr_peer); } static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, .msdbd = 1, .add_port = nvmet_tcp_add_port, .remove_port = nvmet_tcp_remove_port, .queue_response = nvmet_tcp_queue_response, .delete_ctrl = nvmet_tcp_delete_ctrl, .install_queue = nvmet_tcp_install_queue, .disc_traddr = nvmet_tcp_disc_port_addr, .host_traddr = nvmet_tcp_host_port_addr, }; static int __init nvmet_tcp_init(void) { int ret; nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!nvmet_tcp_wq) return -ENOMEM; ret = nvmet_register_transport(&nvmet_tcp_ops); if (ret) goto err; return 0; err: destroy_workqueue(nvmet_tcp_wq); return ret; } static void __exit nvmet_tcp_exit(void) { struct nvmet_tcp_queue *queue; nvmet_unregister_transport(&nvmet_tcp_ops); flush_workqueue(nvmet_wq); mutex_lock(&nvmet_tcp_queue_mutex); list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) kernel_sock_shutdown(queue->sock, SHUT_RDWR); mutex_unlock(&nvmet_tcp_queue_mutex); flush_workqueue(nvmet_wq); destroy_workqueue(nvmet_tcp_wq); ida_destroy(&nvmet_tcp_queue_ida); } module_init(nvmet_tcp_init); module_exit(nvmet_tcp_exit); MODULE_DESCRIPTION("NVMe target TCP transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ |
| 551 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> static const unsigned int nf_ct_generic_timeout = 600*HZ; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) timeout = &gn->timeout; if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; else { /* Set default generic timeout. */ *timeout = gn->timeout; } return 0; } static int generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_generic_init_net(struct net *net) { struct nf_generic_net *gn = nf_generic_pernet(net); gn->timeout = nf_ct_generic_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l4proto = 255, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, .obj_to_nlattr = generic_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_GENERIC_MAX, .obj_size = sizeof(unsigned int), .nla_policy = generic_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
| 1683 123 123 63 46 47 335 1626 1633 1623 1624 1161 1149 4 1149 1152 1152 1152 1152 1149 176 4 1009 901 899 899 900 898 335 900 3 634 634 634 634 633 377 377 377 377 105 105 105 105 105 19 167 167 919 919 919 919 49 176 176 176 169 168 175 169 27 176 175 4 10 10 10 10 10 10 10 10 10 10 489 4 489 472 477 472 138 123 16 63 63 46 43 43 46 5734 5736 5736 5747 5733 472 470 473 478 470 474 473 473 469 472 473 472 138 138 138 138 138 123 138 64 64 63 63 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline bool lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); return false; } return true; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. */ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } if (!mlru) { mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg && !css_is_dying(&pos->css)); if (unlikely(mlru)) kfree(mlru); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy); |
| 71 64 2 2 2 2 80 16 16 16 16 16 16 16 16 1 16 16 16 6 6 6 6 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 18 16 16 16 16 16 16 16 16 16 16 16 16 49 58 6 54 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 62 62 53 50 53 53 53 53 53 52 53 53 53 53 3 53 53 53 53 53 53 62 62 62 62 62 62 62 60 58 56 58 58 58 53 53 53 50 53 38 38 38 53 53 53 53 47 53 53 51 50 51 50 49 49 3 3 49 47 47 49 49 2 2 47 49 49 2 2 49 49 3 49 49 49 49 49 51 2 49 53 53 52 5 49 1 49 53 49 49 49 49 49 49 49 49 49 49 49 49 49 2 2 49 49 49 49 49 49 49 49 49 49 49 49 49 49 41 53 53 53 5 49 49 49 49 49 49 6 6 6 6 6 6 6 3 3 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 16 16 16 53 53 53 53 53 53 53 53 53 50 50 50 50 53 53 53 53 53 53 53 53 53 53 53 53 53 41 16 16 16 16 16 41 41 41 41 41 41 41 41 2 2 2 2 1 1 1 1 626 625 626 2 2 2 2 53 53 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 | /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /** * DOC: Wireless regulatory infrastructure * * The usual implementation is for a driver to read a device EEPROM to * determine which regulatory domain it should be operating under, then * looking up the allowable channels in a driver-local table and finally * registering those channels in the wiphy structure. * * Another set of compliance enforcement is for drivers to use their * own compliance limits which can be stored on the EEPROM. The host * driver or firmware may ensure these are used. * * In addition to all this we provide an extra layer of regulatory * conformance. For drivers which do not have any regulatory * information CRDA provides the complete regulatory solution. * For others it provides a community effort on further restrictions * to enhance compliance. * * Note: When number of rules --> infinity we will not be able to * index on alpha2 any more, instead we'll probably have to * rely on some SHA1 checksum of the regdomain for example. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/nl80211.h> #include <linux/device/faux.h> #include <linux/verification.h> #include <linux/moduleparam.h> #include <linux/firmware.h> #include <linux/units.h> #include <net/cfg80211.h> #include "core.h" #include "reg.h" #include "rdev-ops.h" #include "nl80211.h" /* * Grace period we give before making sure all current interfaces reside on * channels allowed by the current regulatory domain. */ #define REG_ENFORCE_GRACE_MS 60000 /** * enum reg_request_treatment - regulatory request treatment * * @REG_REQ_OK: continue processing the regulatory request * @REG_REQ_IGNORE: ignore the regulatory request * @REG_REQ_INTERSECT: the regulatory domain resulting from this request should * be intersected with the current one. * @REG_REQ_ALREADY_SET: the regulatory request will not change the current * regulatory settings, and no further processing is required. */ enum reg_request_treatment { REG_REQ_OK, REG_REQ_IGNORE, REG_REQ_INTERSECT, REG_REQ_ALREADY_SET, }; static struct regulatory_request core_request_world = { .initiator = NL80211_REGDOM_SET_BY_CORE, .alpha2[0] = '0', .alpha2[1] = '0', .intersect = false, .processed = true, .country_ie_env = ENVIRON_ANY, }; /* * Receipt of information from last regulatory request, * protected by RTNL (and can be accessed with RCU protection) */ static struct regulatory_request __rcu *last_request = (void __force __rcu *)&core_request_world; /* To trigger userspace events and load firmware */ static struct faux_device *reg_fdev; /* * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no * information to give us an alpha2. * (protected by RTNL, can be read under RCU) */ const struct ieee80211_regdomain __rcu *cfg80211_regdomain; /* * Number of devices that registered to the core * that support cellular base station regulatory hints * (protected by RTNL) */ static int reg_num_devs_support_basehint; /* * State variable indicating if the platform on which the devices * are attached is operating in an indoor environment. The state variable * is relevant for all registered devices. */ static bool reg_is_indoor; static DEFINE_SPINLOCK(reg_indoor_lock); /* Used to track the userspace process controlling the indoor setting */ static u32 reg_is_indoor_portid; static void restore_regulatory_settings(bool reset_user, bool cached); static void print_regdomain(const struct ieee80211_regdomain *rd); static void reg_process_hint(struct regulatory_request *reg_request); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { return rcu_dereference_rtnl(cfg80211_regdomain); } /* * Returns the regulatory domain associated with the wiphy. * * Requires any of RTNL, wiphy mutex or RCU protection. */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_check(wiphy->regd, lockdep_is_held(&wiphy->mtx) || lockdep_rtnl_is_held()); } EXPORT_SYMBOL(get_wiphy_regdom); static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) { switch (dfs_region) { case NL80211_DFS_UNSET: return "unset"; case NL80211_DFS_FCC: return "FCC"; case NL80211_DFS_ETSI: return "ETSI"; case NL80211_DFS_JP: return "JP"; } return "Unknown"; } enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; enum nl80211_dfs_regions dfs_region; rcu_read_lock(); regd = get_cfg80211_regdom(); dfs_region = regd->dfs_region; if (!wiphy) goto out; wiphy_regd = get_wiphy_regdom(wiphy); if (!wiphy_regd) goto out; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { dfs_region = wiphy_regd->dfs_region; goto out; } if (wiphy_regd->dfs_region == regd->dfs_region) goto out; pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n", dev_name(&wiphy->dev), reg_dfs_region_str(wiphy_regd->dfs_region), reg_dfs_region_str(regd->dfs_region)); out: rcu_read_unlock(); return dfs_region; } static void rcu_free_regdom(const struct ieee80211_regdomain *r) { if (!r) return; kfree_rcu((struct ieee80211_regdomain *)r, rcu_head); } static struct regulatory_request *get_last_request(void) { return rcu_dereference_rtnl(last_request); } /* Used to queue up regulatory hints */ static LIST_HEAD(reg_requests_list); static DEFINE_SPINLOCK(reg_requests_lock); /* Used to queue up beacon hints for review */ static LIST_HEAD(reg_pending_beacons); static DEFINE_SPINLOCK(reg_pending_beacons_lock); /* Used to keep track of processed beacon hints */ static LIST_HEAD(reg_beacon_list); struct reg_beacon { struct list_head list; struct ieee80211_channel chan; }; static void reg_check_chans_work(struct work_struct *work); static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work); static void reg_todo(struct work_struct *work); static DECLARE_WORK(reg_work, reg_todo); /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { .n_reg_rules = 8, .alpha2 = "00", .reg_rules = { /* IEEE 802.11b/g, channels 1..11 */ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), /* IEEE 802.11b/g, channels 12..13. */ REG_RULE(2467-10, 2472+10, 20, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW), /* IEEE 802.11 channel 14 - Only JP enables * this and for 802.11b only */ REG_RULE(2484-10, 2484+10, 20, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_NO_OFDM), /* IEEE 802.11a, channel 36..48 */ REG_RULE(5180-10, 5240+10, 80, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW), /* IEEE 802.11a, channel 52..64 - DFS required */ REG_RULE(5260-10, 5320+10, 80, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW | NL80211_RRF_DFS), /* IEEE 802.11a, channel 100..144 - DFS required */ REG_RULE(5500-10, 5720+10, 160, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_DFS), /* IEEE 802.11a, channel 149..165 */ REG_RULE(5745-10, 5825+10, 80, 6, 20, NL80211_RRF_NO_IR), /* IEEE 802.11ad (60GHz), channels 1..3 */ REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0), } }; /* protected by RTNL */ static const struct ieee80211_regdomain *cfg80211_world_regdom = &world_regdom; static char *ieee80211_regdom = "00"; static char user_alpha2[2]; static const struct ieee80211_regdomain *cfg80211_user_regdom; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); static void reg_free_request(struct regulatory_request *request) { if (request == &core_request_world) return; if (request != get_last_request()) kfree(request); } static void reg_free_last_request(void) { struct regulatory_request *lr = get_last_request(); if (lr != &core_request_world && lr) kfree_rcu(lr, rcu_head); } static void reg_update_last_request(struct regulatory_request *request) { struct regulatory_request *lr; lr = get_last_request(); if (lr == request) return; reg_free_last_request(); rcu_assign_pointer(last_request, request); } static void reset_regdomains(bool full_reset, const struct ieee80211_regdomain *new_regdom) { const struct ieee80211_regdomain *r; ASSERT_RTNL(); r = get_cfg80211_regdom(); /* avoid freeing static information or freeing something twice */ if (r == cfg80211_world_regdom) r = NULL; if (cfg80211_world_regdom == &world_regdom) cfg80211_world_regdom = NULL; if (r == &world_regdom) r = NULL; rcu_free_regdom(r); rcu_free_regdom(cfg80211_world_regdom); cfg80211_world_regdom = &world_regdom; rcu_assign_pointer(cfg80211_regdomain, new_regdom); if (!full_reset) return; reg_update_last_request(&core_request_world); } /* * Dynamic world regulatory domain requested by the wireless * core upon initialization */ static void update_world_regdomain(const struct ieee80211_regdomain *rd) { struct regulatory_request *lr; lr = get_last_request(); WARN_ON(!lr); reset_regdomains(false, rd); cfg80211_world_regdom = rd; } bool is_world_regdom(const char *alpha2) { if (!alpha2) return false; return alpha2[0] == '0' && alpha2[1] == '0'; } static bool is_alpha2_set(const char *alpha2) { if (!alpha2) return false; return alpha2[0] && alpha2[1]; } static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) return false; /* * Special case where regulatory domain was built by driver * but a specific alpha2 cannot be determined */ return alpha2[0] == '9' && alpha2[1] == '9'; } static bool is_intersected_alpha2(const char *alpha2) { if (!alpha2) return false; /* * Special case where regulatory domain is the * result of an intersection between two regulatory domain * structures */ return alpha2[0] == '9' && alpha2[1] == '8'; } static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; return isascii(alpha2[0]) && isalpha(alpha2[0]) && isascii(alpha2[1]) && isalpha(alpha2[1]); } static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) { if (!alpha2_x || !alpha2_y) return false; return alpha2_x[0] == alpha2_y[0] && alpha2_x[1] == alpha2_y[1]; } static bool regdom_changes(const char *alpha2) { const struct ieee80211_regdomain *r = get_cfg80211_regdom(); if (!r) return true; return !alpha2_equal(r->alpha2, alpha2); } /* * The NL80211_REGDOM_SET_BY_USER regdom alpha2 is cached, this lets * you know if a valid regulatory hint with NL80211_REGDOM_SET_BY_USER * has ever been issued. */ static bool is_user_regdom_saved(void) { if (user_alpha2[0] == '9' && user_alpha2[1] == '7') return false; /* This would indicate a mistake on the design */ if (WARN(!is_world_regdom(user_alpha2) && !is_an_alpha2(user_alpha2), "Unexpected user alpha2: %c%c\n", user_alpha2[0], user_alpha2[1])) return false; return true; } static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; unsigned int i; regd = kzalloc(struct_size(regd, reg_rules, src_regd->n_reg_rules), GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); for (i = 0; i < src_regd->n_reg_rules; i++) memcpy(®d->reg_rules[i], &src_regd->reg_rules[i], sizeof(struct ieee80211_reg_rule)); return regd; } static void cfg80211_save_user_regdom(const struct ieee80211_regdomain *rd) { ASSERT_RTNL(); if (!IS_ERR(cfg80211_user_regdom)) kfree(cfg80211_user_regdom); cfg80211_user_regdom = reg_copy_regd(rd); } struct reg_regdb_apply_request { struct list_head list; const struct ieee80211_regdomain *regdom; }; static LIST_HEAD(reg_regdb_apply_list); static DEFINE_MUTEX(reg_regdb_apply_mutex); static void reg_regdb_apply(struct work_struct *work) { struct reg_regdb_apply_request *request; rtnl_lock(); mutex_lock(®_regdb_apply_mutex); while (!list_empty(®_regdb_apply_list)) { request = list_first_entry(®_regdb_apply_list, struct reg_regdb_apply_request, list); list_del(&request->list); set_regdom(request->regdom, REGD_SOURCE_INTERNAL_DB); kfree(request); } mutex_unlock(®_regdb_apply_mutex); rtnl_unlock(); } static DECLARE_WORK(reg_regdb_work, reg_regdb_apply); static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) { struct reg_regdb_apply_request *request; request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); if (!request) { kfree(regdom); return -ENOMEM; } request->regdom = regdom; mutex_lock(®_regdb_apply_mutex); list_add_tail(&request->list, ®_regdb_apply_list); mutex_unlock(®_regdb_apply_mutex); schedule_work(®_regdb_work); return 0; } #ifdef CONFIG_CFG80211_CRDA_SUPPORT /* Max number of consecutive attempts to communicate with CRDA */ #define REG_MAX_CRDA_TIMEOUTS 10 static u32 reg_crda_timeouts; static void crda_timeout_work(struct work_struct *work); static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work); static void crda_timeout_work(struct work_struct *work) { pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); rtnl_lock(); reg_crda_timeouts++; restore_regulatory_settings(true, false); rtnl_unlock(); } static void cancel_crda_timeout(void) { cancel_delayed_work(&crda_timeout); } static void cancel_crda_timeout_sync(void) { cancel_delayed_work_sync(&crda_timeout); } static void reset_crda_timeouts(void) { reg_crda_timeouts = 0; } /* * This lets us keep regulatory code which is updated on a regulatory * basis in userspace. */ static int call_crda(const char *alpha2) { char country[12]; char *env[] = { country, NULL }; int ret; snprintf(country, sizeof(country), "COUNTRY=%c%c", alpha2[0], alpha2[1]); if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) { pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n"); return -EINVAL; } if (!is_world_regdom((char *) alpha2)) pr_debug("Calling CRDA for country: %c%c\n", alpha2[0], alpha2[1]); else pr_debug("Calling CRDA to update world regulatory domain\n"); ret = kobject_uevent_env(®_fdev->dev.kobj, KOBJ_CHANGE, env); if (ret) return ret; queue_delayed_work(system_power_efficient_wq, &crda_timeout, msecs_to_jiffies(3142)); return 0; } #else static inline void cancel_crda_timeout(void) {} static inline void cancel_crda_timeout_sync(void) {} static inline void reset_crda_timeouts(void) {} static inline int call_crda(const char *alpha2) { return -ENODATA; } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ /* code to directly load a firmware database through request_firmware */ static const struct fwdb_header *regdb; struct fwdb_country { u8 alpha2[2]; __be16 coll_ptr; /* this struct cannot be extended */ } __packed __aligned(4); struct fwdb_collection { u8 len; u8 n_rules; u8 dfs_region; /* no optional data yet */ /* aligned to 2, then followed by __be16 array of rule pointers */ } __packed __aligned(4); enum fwdb_flags { FWDB_FLAG_NO_OFDM = BIT(0), FWDB_FLAG_NO_OUTDOOR = BIT(1), FWDB_FLAG_DFS = BIT(2), FWDB_FLAG_NO_IR = BIT(3), FWDB_FLAG_AUTO_BW = BIT(4), }; struct fwdb_wmm_ac { u8 ecw; u8 aifsn; __be16 cot; } __packed; struct fwdb_wmm_rule { struct fwdb_wmm_ac client[IEEE80211_NUM_ACS]; struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS]; } __packed; struct fwdb_rule { u8 len; u8 flags; __be16 max_eirp; __be32 start, end, max_bw; /* start of optional data */ __be16 cac_timeout; __be16 wmm_ptr; } __packed __aligned(4); #define FWDB_MAGIC 0x52474442 #define FWDB_VERSION 20 struct fwdb_header { __be32 magic; __be32 version; struct fwdb_country country[]; } __packed __aligned(4); static int ecw2cw(int ecw) { return (1 << ecw) - 1; } static bool valid_wmm(struct fwdb_wmm_rule *rule) { struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule; int i; for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) { u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4); u16 cw_max = ecw2cw(ac[i].ecw & 0x0f); u8 aifsn = ac[i].aifsn; if (cw_min >= cw_max) return false; if (aifsn < 1) return false; } return true; } static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) { struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); if ((u8 *)rule + sizeof(rule->len) > data + size) return false; /* mandatory fields */ if (rule->len < offsetofend(struct fwdb_rule, max_bw)) return false; if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; struct fwdb_wmm_rule *wmm; if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size) return false; wmm = (void *)(data + wmm_ptr); if (!valid_wmm(wmm)) return false; } return true; } static bool valid_country(const u8 *data, unsigned int size, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)(data + ptr); __be16 *rules_ptr; unsigned int i; /* make sure we can read len/n_rules */ if ((u8 *)coll + offsetofend(typeof(*coll), n_rules) > data + size) return false; /* make sure base struct and all rules fit */ if ((u8 *)coll + ALIGN(coll->len, 2) + (coll->n_rules * 2) > data + size) return false; /* mandatory fields must exist */ if (coll->len < offsetofend(struct fwdb_collection, dfs_region)) return false; rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); for (i = 0; i < coll->n_rules; i++) { u16 rule_ptr = be16_to_cpu(rules_ptr[i]); if (!valid_rule(data, size, rule_ptr)) return false; } return true; } #ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB #include <keys/asymmetric-type.h> static struct key *builtin_regdb_keys; static int __init load_builtin_regdb_keys(void) { builtin_regdb_keys = keyring_alloc(".builtin_regdb_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_regdb_keys)) return PTR_ERR(builtin_regdb_keys); pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); #ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS x509_load_certificate_list(shipped_regdb_certs, shipped_regdb_certs_len, builtin_regdb_keys); #endif #ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') x509_load_certificate_list(extra_regdb_certs, extra_regdb_certs_len, builtin_regdb_keys); #endif return 0; } MODULE_FIRMWARE("regulatory.db.p7s"); static bool regdb_has_valid_signature(const u8 *data, unsigned int size) { const struct firmware *sig; bool result; if (request_firmware(&sig, "regulatory.db.p7s", ®_fdev->dev)) return false; result = verify_pkcs7_signature(data, size, sig->data, sig->size, builtin_regdb_keys, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL) == 0; release_firmware(sig); return result; } static void free_regdb_keyring(void) { key_put(builtin_regdb_keys); } #else static int load_builtin_regdb_keys(void) { return 0; } static bool regdb_has_valid_signature(const u8 *data, unsigned int size) { return true; } static void free_regdb_keyring(void) { } #endif /* CONFIG_CFG80211_REQUIRE_SIGNED_REGDB */ static bool valid_regdb(const u8 *data, unsigned int size) { const struct fwdb_header *hdr = (void *)data; const struct fwdb_country *country; if (size < sizeof(*hdr)) return false; if (hdr->magic != cpu_to_be32(FWDB_MAGIC)) return false; if (hdr->version != cpu_to_be32(FWDB_VERSION)) return false; if (!regdb_has_valid_signature(data, size)) return false; country = &hdr->country[0]; while ((u8 *)(country + 1) <= data + size) { if (!country->coll_ptr) break; if (!valid_country(data, size, country)) return false; country++; } return true; } static void set_wmm_rule(const struct fwdb_header *db, const struct fwdb_country *country, const struct fwdb_rule *rule, struct ieee80211_reg_rule *rrule) { struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule; struct fwdb_wmm_rule *wmm; unsigned int i, wmm_ptr; wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; wmm = (void *)((u8 *)db + wmm_ptr); if (!valid_wmm(wmm)) { pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n", be32_to_cpu(rule->start), be32_to_cpu(rule->end), country->alpha2[0], country->alpha2[1]); return; } for (i = 0; i < IEEE80211_NUM_ACS; i++) { wmm_rule->client[i].cw_min = ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); wmm_rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); wmm_rule->client[i].aifsn = wmm->client[i].aifsn; wmm_rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); wmm_rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); wmm_rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn; wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); } rrule->has_wmm = true; } static int __regdb_query_wmm(const struct fwdb_header *db, const struct fwdb_country *country, int freq, struct ieee80211_reg_rule *rrule) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); int i; for (i = 0; i < coll->n_rules; i++) { __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr)) continue; if (freq >= KHZ_TO_MHZ(be32_to_cpu(rule->start)) && freq <= KHZ_TO_MHZ(be32_to_cpu(rule->end))) { set_wmm_rule(db, country, rule, rrule); return 0; } } return -ENODATA; } int reg_query_regdb_wmm(char *alpha2, int freq, struct ieee80211_reg_rule *rule) { const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; if (!regdb) return -ENODATA; if (IS_ERR(regdb)) return PTR_ERR(regdb); country = &hdr->country[0]; while (country->coll_ptr) { if (alpha2_equal(alpha2, country->alpha2)) return __regdb_query_wmm(regdb, country, freq, rule); country++; } return -ENODATA; } EXPORT_SYMBOL(reg_query_regdb_wmm); static int regdb_query_country(const struct fwdb_header *db, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; unsigned int i; regdom = kzalloc(struct_size(regdom, reg_rules, coll->n_rules), GFP_KERNEL); if (!regdom) return -ENOMEM; regdom->n_reg_rules = coll->n_rules; regdom->alpha2[0] = country->alpha2[0]; regdom->alpha2[1] = country->alpha2[1]; regdom->dfs_region = coll->dfs_region; for (i = 0; i < regdom->n_reg_rules; i++) { __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); struct ieee80211_reg_rule *rrule = ®dom->reg_rules[i]; rrule->freq_range.start_freq_khz = be32_to_cpu(rule->start); rrule->freq_range.end_freq_khz = be32_to_cpu(rule->end); rrule->freq_range.max_bandwidth_khz = be32_to_cpu(rule->max_bw); rrule->power_rule.max_antenna_gain = 0; rrule->power_rule.max_eirp = be16_to_cpu(rule->max_eirp); rrule->flags = 0; if (rule->flags & FWDB_FLAG_NO_OFDM) rrule->flags |= NL80211_RRF_NO_OFDM; if (rule->flags & FWDB_FLAG_NO_OUTDOOR) rrule->flags |= NL80211_RRF_NO_OUTDOOR; if (rule->flags & FWDB_FLAG_DFS) rrule->flags |= NL80211_RRF_DFS; if (rule->flags & FWDB_FLAG_NO_IR) rrule->flags |= NL80211_RRF_NO_IR; if (rule->flags & FWDB_FLAG_AUTO_BW) rrule->flags |= NL80211_RRF_AUTO_BW; rrule->dfs_cac_ms = 0; /* handle optional data */ if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) rrule->dfs_cac_ms = 1000 * be16_to_cpu(rule->cac_timeout); if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) set_wmm_rule(db, country, rule, rrule); } return reg_schedule_apply(regdom); } static int query_regdb(const char *alpha2) { const struct fwdb_header *hdr = regdb; const struct fwdb_country *country; ASSERT_RTNL(); if (IS_ERR(regdb)) return PTR_ERR(regdb); country = &hdr->country[0]; while (country->coll_ptr) { if (alpha2_equal(alpha2, country->alpha2)) return regdb_query_country(regdb, country); country++; } return -ENODATA; } static void regdb_fw_cb(const struct firmware *fw, void *context) { int set_error = 0; bool restore = true; void *db; if (!fw) { pr_info("failed to load regulatory.db\n"); set_error = -ENODATA; } else if (!valid_regdb(fw->data, fw->size)) { pr_info("loaded regulatory.db is malformed or signature is missing/invalid\n"); set_error = -EINVAL; } rtnl_lock(); if (regdb && !IS_ERR(regdb)) { /* negative case - a bug * positive case - can happen due to race in case of multiple cb's in * queue, due to usage of asynchronous callback * * Either case, just restore and free new db. */ } else if (set_error) { regdb = ERR_PTR(set_error); } else if (fw) { db = kmemdup(fw->data, fw->size, GFP_KERNEL); if (db) { regdb = db; restore = context && query_regdb(context); } else { restore = true; } } if (restore) restore_regulatory_settings(true, false); rtnl_unlock(); kfree(context); release_firmware(fw); } MODULE_FIRMWARE("regulatory.db"); static int query_regdb_file(const char *alpha2) { int err; ASSERT_RTNL(); if (regdb) return query_regdb(alpha2); alpha2 = kmemdup(alpha2, 2, GFP_KERNEL); if (!alpha2) return -ENOMEM; err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db", ®_fdev->dev, GFP_KERNEL, (void *)alpha2, regdb_fw_cb); if (err) kfree(alpha2); return err; } int reg_reload_regdb(void) { const struct firmware *fw; void *db; int err; const struct ieee80211_regdomain *current_regdomain; struct regulatory_request *request; err = request_firmware(&fw, "regulatory.db", ®_fdev->dev); if (err) return err; if (!valid_regdb(fw->data, fw->size)) { err = -ENODATA; goto out; } db = kmemdup(fw->data, fw->size, GFP_KERNEL); if (!db) { err = -ENOMEM; goto out; } rtnl_lock(); if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); regdb = db; /* reset regulatory domain */ current_regdomain = get_cfg80211_regdom(); request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) { err = -ENOMEM; goto out_unlock; } request->wiphy_idx = WIPHY_IDX_INVALID; request->alpha2[0] = current_regdomain->alpha2[0]; request->alpha2[1] = current_regdomain->alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; request->user_reg_hint_type = NL80211_USER_REG_HINT_USER; reg_process_hint(request); out_unlock: rtnl_unlock(); out: release_firmware(fw); return err; } static bool reg_query_database(struct regulatory_request *request) { if (query_regdb_file(request->alpha2) == 0) return true; if (call_crda(request->alpha2) == 0) return true; return false; } bool reg_is_valid_request(const char *alpha2) { struct regulatory_request *lr = get_last_request(); if (!lr || lr->processed) return false; return alpha2_equal(lr->alpha2, alpha2); } static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy) { struct regulatory_request *lr = get_last_request(); /* * Follow the driver's regulatory domain, if present, unless a country * IE has been processed or a user wants to help compliance further */ if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && lr->initiator != NL80211_REGDOM_SET_BY_USER && wiphy->regd) return get_wiphy_regdom(wiphy); return get_cfg80211_regdom(); } static unsigned int reg_get_max_bandwidth_from_range(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule) { const struct ieee80211_freq_range *freq_range = &rule->freq_range; const struct ieee80211_freq_range *freq_range_tmp; const struct ieee80211_reg_rule *tmp; u32 start_freq, end_freq, idx, no; for (idx = 0; idx < rd->n_reg_rules; idx++) if (rule == &rd->reg_rules[idx]) break; if (idx == rd->n_reg_rules) return 0; /* get start_freq */ no = idx; while (no) { tmp = &rd->reg_rules[--no]; freq_range_tmp = &tmp->freq_range; if (freq_range_tmp->end_freq_khz < freq_range->start_freq_khz) break; freq_range = freq_range_tmp; } start_freq = freq_range->start_freq_khz; /* get end_freq */ freq_range = &rule->freq_range; no = idx; while (no < rd->n_reg_rules - 1) { tmp = &rd->reg_rules[++no]; freq_range_tmp = &tmp->freq_range; if (freq_range_tmp->start_freq_khz > freq_range->end_freq_khz) break; freq_range = freq_range_tmp; } end_freq = freq_range->end_freq_khz; return end_freq - start_freq; } unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule) { unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule); if (rule->flags & NL80211_RRF_NO_320MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(160)); if (rule->flags & NL80211_RRF_NO_160MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80)); if (rule->flags & NL80211_RRF_NO_80MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(40)); /* * HT40+/HT40- limits are handled per-channel. Only limit BW if both * are not allowed. */ if (rule->flags & NL80211_RRF_NO_HT40MINUS && rule->flags & NL80211_RRF_NO_HT40PLUS) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(20)); return bw; } /* Sanity check on a regulatory rule */ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) { const struct ieee80211_freq_range *freq_range = &rule->freq_range; u32 freq_diff; if (freq_range->start_freq_khz <= 0 || freq_range->end_freq_khz <= 0) return false; if (freq_range->start_freq_khz > freq_range->end_freq_khz) return false; freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->end_freq_khz <= freq_range->start_freq_khz || freq_range->max_bandwidth_khz > freq_diff) return false; return true; } static bool is_valid_rd(const struct ieee80211_regdomain *rd) { const struct ieee80211_reg_rule *reg_rule = NULL; unsigned int i; if (!rd->n_reg_rules) return false; if (WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES)) return false; for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; if (!is_valid_reg_rule(reg_rule)) return false; } return true; } /** * freq_in_rule_band - tells us if a frequency is in a frequency band * @freq_range: frequency rule we want to query * @freq_khz: frequency we are inquiring about * * This lets us know if a specific frequency rule is or is not relevant to * a specific frequency's band. Bands are device specific and artificial * definitions (the "2.4 GHz band", the "5 GHz band" and the "60GHz band"), * however it is safe for now to assume that a frequency rule should not be * part of a frequency's band if the start freq or end freq are off by more * than 2 GHz for the 2.4 and 5 GHz bands, and by more than 20 GHz for the * 60 GHz band. * This resolution can be lowered and should be considered as we add * regulatory rule support for other "bands". * * Returns: whether or not the frequency is in the range */ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, u32 freq_khz) { /* * From 802.11ad: directional multi-gigabit (DMG): * Pertaining to operation in a frequency band containing a channel * with the Channel starting frequency above 45 GHz. */ u32 limit = freq_khz > 45 * KHZ_PER_GHZ ? 20 * KHZ_PER_GHZ : 2 * KHZ_PER_GHZ; if (abs(freq_khz - freq_range->start_freq_khz) <= limit) return true; if (abs(freq_khz - freq_range->end_freq_khz) <= limit) return true; return false; } /* * Later on we can perhaps use the more restrictive DFS * region but we don't have information for that yet so * for now simply disallow conflicts. */ static enum nl80211_dfs_regions reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1, const enum nl80211_dfs_regions dfs_region2) { if (dfs_region1 != dfs_region2) return NL80211_DFS_UNSET; return dfs_region1; } static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1, const struct ieee80211_wmm_ac *wmm_ac2, struct ieee80211_wmm_ac *intersect) { intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min); intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max); intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot); intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn); } /* * Helper for regdom_intersect(), this does the real * mathematical intersection fun */ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, const struct ieee80211_regdomain *rd2, const struct ieee80211_reg_rule *rule1, const struct ieee80211_reg_rule *rule2, struct ieee80211_reg_rule *intersected_rule) { const struct ieee80211_freq_range *freq_range1, *freq_range2; struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule1, *power_rule2; struct ieee80211_power_rule *power_rule; const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2; struct ieee80211_wmm_rule *wmm_rule; u32 freq_diff, max_bandwidth1, max_bandwidth2; freq_range1 = &rule1->freq_range; freq_range2 = &rule2->freq_range; freq_range = &intersected_rule->freq_range; power_rule1 = &rule1->power_rule; power_rule2 = &rule2->power_rule; power_rule = &intersected_rule->power_rule; wmm_rule1 = &rule1->wmm_rule; wmm_rule2 = &rule2->wmm_rule; wmm_rule = &intersected_rule->wmm_rule; freq_range->start_freq_khz = max(freq_range1->start_freq_khz, freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, freq_range2->end_freq_khz); max_bandwidth1 = freq_range1->max_bandwidth_khz; max_bandwidth2 = freq_range2->max_bandwidth_khz; if (rule1->flags & NL80211_RRF_AUTO_BW) max_bandwidth1 = reg_get_max_bandwidth(rd1, rule1); if (rule2->flags & NL80211_RRF_AUTO_BW) max_bandwidth2 = reg_get_max_bandwidth(rd2, rule2); freq_range->max_bandwidth_khz = min(max_bandwidth1, max_bandwidth2); intersected_rule->flags = rule1->flags | rule2->flags; /* * In case NL80211_RRF_AUTO_BW requested for both rules * set AUTO_BW in intersected rule also. Next we will * calculate BW correctly in handle_channel function. * In other case remove AUTO_BW flag while we calculate * maximum bandwidth correctly and auto calculation is * not required. */ if ((rule1->flags & NL80211_RRF_AUTO_BW) && (rule2->flags & NL80211_RRF_AUTO_BW)) intersected_rule->flags |= NL80211_RRF_AUTO_BW; else intersected_rule->flags &= ~NL80211_RRF_AUTO_BW; freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; if (freq_range->max_bandwidth_khz > freq_diff) freq_range->max_bandwidth_khz = freq_diff; power_rule->max_eirp = min(power_rule1->max_eirp, power_rule2->max_eirp); power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain, power_rule2->max_antenna_gain); intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms, rule2->dfs_cac_ms); if (rule1->has_wmm && rule2->has_wmm) { u8 ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { reg_wmm_rules_intersect(&wmm_rule1->client[ac], &wmm_rule2->client[ac], &wmm_rule->client[ac]); reg_wmm_rules_intersect(&wmm_rule1->ap[ac], &wmm_rule2->ap[ac], &wmm_rule->ap[ac]); } intersected_rule->has_wmm = true; } else if (rule1->has_wmm) { *wmm_rule = *wmm_rule1; intersected_rule->has_wmm = true; } else if (rule2->has_wmm) { *wmm_rule = *wmm_rule2; intersected_rule->has_wmm = true; } else { intersected_rule->has_wmm = false; } if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; return 0; } /* check whether old rule contains new rule */ static bool rule_contains(struct ieee80211_reg_rule *r1, struct ieee80211_reg_rule *r2) { /* for simplicity, currently consider only same flags */ if (r1->flags != r2->flags) return false; /* verify r1 is more restrictive */ if ((r1->power_rule.max_antenna_gain > r2->power_rule.max_antenna_gain) || r1->power_rule.max_eirp > r2->power_rule.max_eirp) return false; /* make sure r2's range is contained within r1 */ if (r1->freq_range.start_freq_khz > r2->freq_range.start_freq_khz || r1->freq_range.end_freq_khz < r2->freq_range.end_freq_khz) return false; /* and finally verify that r1.max_bw >= r2.max_bw */ if (r1->freq_range.max_bandwidth_khz < r2->freq_range.max_bandwidth_khz) return false; return true; } /* add or extend current rules. do nothing if rule is already contained */ static void add_rule(struct ieee80211_reg_rule *rule, struct ieee80211_reg_rule *reg_rules, u32 *n_rules) { struct ieee80211_reg_rule *tmp_rule; int i; for (i = 0; i < *n_rules; i++) { tmp_rule = ®_rules[i]; /* rule is already contained - do nothing */ if (rule_contains(tmp_rule, rule)) return; /* extend rule if possible */ if (rule_contains(rule, tmp_rule)) { memcpy(tmp_rule, rule, sizeof(*rule)); return; } } memcpy(®_rules[*n_rules], rule, sizeof(*rule)); (*n_rules)++; } /** * regdom_intersect - do the intersection between two regulatory domains * @rd1: first regulatory domain * @rd2: second regulatory domain * * Use this function to get the intersection between two regulatory domains. * Once completed we will mark the alpha2 for the rd as intersected, "98", * as no one single alpha2 can represent this regulatory domain. * * Returns a pointer to the regulatory domain structure which will hold the * resulting intersection of rules between rd1 and rd2. We will * kzalloc() this structure for you. * * Returns: the intersected regdomain */ static struct ieee80211_regdomain * regdom_intersect(const struct ieee80211_regdomain *rd1, const struct ieee80211_regdomain *rd2) { int r; unsigned int x, y; unsigned int num_rules = 0; const struct ieee80211_reg_rule *rule1, *rule2; struct ieee80211_reg_rule intersected_rule; struct ieee80211_regdomain *rd; if (!rd1 || !rd2) return NULL; /* * First we get a count of the rules we'll need, then we actually * build them. This is to so we can malloc() and free() a * regdomain once. The reason we use reg_rules_intersect() here * is it will return -EINVAL if the rule computed makes no sense. * All rules that do check out OK are valid. */ for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; if (!reg_rules_intersect(rd1, rd2, rule1, rule2, &intersected_rule)) num_rules++; } } if (!num_rules) return NULL; rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); if (!rd) return NULL; for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; r = reg_rules_intersect(rd1, rd2, rule1, rule2, &intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore */ if (r) continue; add_rule(&intersected_rule, rd->reg_rules, &rd->n_reg_rules); } } rd->alpha2[0] = '9'; rd->alpha2[1] = '8'; rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region, rd2->dfs_region); return rd; } /* * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may * want to just have the channel structure use these */ static u32 map_regdom_flags(u32 rd_flags) { u32 channel_flags = 0; if (rd_flags & NL80211_RRF_NO_IR_ALL) channel_flags |= IEEE80211_CHAN_NO_IR; if (rd_flags & NL80211_RRF_DFS) channel_flags |= IEEE80211_CHAN_RADAR; if (rd_flags & NL80211_RRF_NO_OFDM) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; if (rd_flags & NL80211_RRF_IR_CONCURRENT) channel_flags |= IEEE80211_CHAN_IR_CONCURRENT; if (rd_flags & NL80211_RRF_NO_HT40MINUS) channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; if (rd_flags & NL80211_RRF_NO_HT40PLUS) channel_flags |= IEEE80211_CHAN_NO_HT40PLUS; if (rd_flags & NL80211_RRF_NO_80MHZ) channel_flags |= IEEE80211_CHAN_NO_80MHZ; if (rd_flags & NL80211_RRF_NO_160MHZ) channel_flags |= IEEE80211_CHAN_NO_160MHZ; if (rd_flags & NL80211_RRF_NO_HE) channel_flags |= IEEE80211_CHAN_NO_HE; if (rd_flags & NL80211_RRF_NO_320MHZ) channel_flags |= IEEE80211_CHAN_NO_320MHZ; if (rd_flags & NL80211_RRF_NO_EHT) channel_flags |= IEEE80211_CHAN_NO_EHT; if (rd_flags & NL80211_RRF_DFS_CONCURRENT) channel_flags |= IEEE80211_CHAN_DFS_CONCURRENT; if (rd_flags & NL80211_RRF_NO_6GHZ_VLP_CLIENT) channel_flags |= IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT; if (rd_flags & NL80211_RRF_NO_6GHZ_AFC_CLIENT) channel_flags |= IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT; if (rd_flags & NL80211_RRF_PSD) channel_flags |= IEEE80211_CHAN_PSD; if (rd_flags & NL80211_RRF_ALLOW_6GHZ_VLP_AP) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; return channel_flags; } static const struct ieee80211_reg_rule * freq_reg_info_regd(u32 center_freq, const struct ieee80211_regdomain *regd, u32 bw) { int i; bool band_rule_found = false; bool bw_fits = false; if (!regd) return ERR_PTR(-EINVAL); for (i = 0; i < regd->n_reg_rules; i++) { const struct ieee80211_reg_rule *rr; const struct ieee80211_freq_range *fr = NULL; rr = ®d->reg_rules[i]; fr = &rr->freq_range; /* * We only need to know if one frequency rule was * in center_freq's band, that's enough, so let's * not overwrite it once found */ if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; } if (!band_rule_found) return ERR_PTR(-ERANGE); return ERR_PTR(-EINVAL); } static const struct ieee80211_reg_rule * __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); static const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20}; const struct ieee80211_reg_rule *reg_rule = ERR_PTR(-ERANGE); int i = ARRAY_SIZE(bws) - 1; u32 bw; for (bw = MHZ_TO_KHZ(bws[i]); bw >= min_bw; bw = MHZ_TO_KHZ(bws[i--])) { reg_rule = freq_reg_info_regd(center_freq, regd, bw); if (!IS_ERR(reg_rule)) return reg_rule; } return reg_rule; } const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, u32 center_freq) { u32 min_bw = center_freq < MHZ_TO_KHZ(1000) ? 1 : 20; return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw)); } EXPORT_SYMBOL(freq_reg_info); const char *reg_initiator_name(enum nl80211_reg_initiator initiator) { switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: return "core"; case NL80211_REGDOM_SET_BY_USER: return "user"; case NL80211_REGDOM_SET_BY_DRIVER: return "driver"; case NL80211_REGDOM_SET_BY_COUNTRY_IE: return "country element"; default: WARN_ON(1); return "bug"; } } EXPORT_SYMBOL(reg_initiator_name); static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd, const struct ieee80211_reg_rule *reg_rule, const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; bool is_s1g = chan->band == NL80211_BAND_S1GHZ; freq_range = ®_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); if (is_s1g) { if (max_bandwidth_khz < MHZ_TO_KHZ(16)) bw_flags |= IEEE80211_CHAN_NO_16MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(8)) bw_flags |= IEEE80211_CHAN_NO_8MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(4)) bw_flags |= IEEE80211_CHAN_NO_4MHZ; return bw_flags; } /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(10)) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(20)) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(320)) bw_flags |= IEEE80211_CHAN_NO_320MHZ; return bw_flags; } static void handle_channel_single_rule(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan, u32 flags, struct regulatory_request *lr, struct wiphy *request_wiphy, const struct ieee80211_reg_rule *reg_rule) { u32 bw_flags = 0; const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_regdomain *regd; regd = reg_get_regdomain(wiphy); power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { /* * This guarantees the driver's requested regulatory domain * will always be used as a base for further regulatory * settings */ chan->flags = chan->orig_flags = map_regdom_flags(reg_rule->flags) | bw_flags; chan->max_antenna_gain = chan->orig_mag = (int) MBI_TO_DBI(power_rule->max_antenna_gain); chan->max_reg_power = chan->max_power = chan->orig_mpwr = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; } if (chan->flags & IEEE80211_CHAN_PSD) chan->psd = reg_rule->psd; return; } chan->dfs_state = NL80211_DFS_USABLE; chan->dfs_state_entered = jiffies; chan->beacon_found = false; chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags); chan->max_antenna_gain = min_t(int, chan->orig_mag, MBI_TO_DBI(power_rule->max_antenna_gain)); chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } if (chan->flags & IEEE80211_CHAN_PSD) chan->psd = reg_rule->psd; if (chan->orig_mpwr) { /* * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER * will always follow the passed country IE power settings. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER) chan->max_power = chan->max_reg_power; else chan->max_power = min(chan->orig_mpwr, chan->max_reg_power); } else chan->max_power = chan->max_reg_power; } static void handle_channel_adjacent_rules(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan, u32 flags, struct regulatory_request *lr, struct wiphy *request_wiphy, const struct ieee80211_reg_rule *rrule1, const struct ieee80211_reg_rule *rrule2, struct ieee80211_freq_range *comb_range) { u32 bw_flags1 = 0; u32 bw_flags2 = 0; const struct ieee80211_power_rule *power_rule1 = NULL; const struct ieee80211_power_rule *power_rule2 = NULL; const struct ieee80211_regdomain *regd; regd = reg_get_regdomain(wiphy); power_rule1 = &rrule1->power_rule; power_rule2 = &rrule2->power_rule; bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan); bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan); if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { /* This guarantees the driver's requested regulatory domain * will always be used as a base for further regulatory * settings */ chan->flags = map_regdom_flags(rrule1->flags) | map_regdom_flags(rrule2->flags) | bw_flags1 | bw_flags2; chan->orig_flags = chan->flags; chan->max_antenna_gain = min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain), MBI_TO_DBI(power_rule2->max_antenna_gain)); chan->orig_mag = chan->max_antenna_gain; chan->max_reg_power = min_t(int, MBM_TO_DBM(power_rule1->max_eirp), MBM_TO_DBM(power_rule2->max_eirp)); chan->max_power = chan->max_reg_power; chan->orig_mpwr = chan->max_reg_power; if (chan->flags & IEEE80211_CHAN_RADAR) { chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) chan->dfs_cac_ms = max_t(unsigned int, rrule1->dfs_cac_ms, rrule2->dfs_cac_ms); } if ((rrule1->flags & NL80211_RRF_PSD) && (rrule2->flags & NL80211_RRF_PSD)) chan->psd = min_t(s8, rrule1->psd, rrule2->psd); else chan->flags &= ~NL80211_RRF_PSD; return; } chan->dfs_state = NL80211_DFS_USABLE; chan->dfs_state_entered = jiffies; chan->beacon_found = false; chan->flags = flags | bw_flags1 | bw_flags2 | map_regdom_flags(rrule1->flags) | map_regdom_flags(rrule2->flags); /* reg_rule_to_chan_bw_flags may forbids 10 and forbids 20 MHz * (otherwise no adj. rule case), recheck therefore */ if (cfg80211_does_bw_fit_range(comb_range, ieee80211_channel_to_khz(chan), MHZ_TO_KHZ(10))) chan->flags &= ~IEEE80211_CHAN_NO_10MHZ; if (cfg80211_does_bw_fit_range(comb_range, ieee80211_channel_to_khz(chan), MHZ_TO_KHZ(20))) chan->flags &= ~IEEE80211_CHAN_NO_20MHZ; chan->max_antenna_gain = min_t(int, chan->orig_mag, min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain), MBI_TO_DBI(power_rule2->max_antenna_gain))); chan->max_reg_power = min_t(int, MBM_TO_DBM(power_rule1->max_eirp), MBM_TO_DBM(power_rule2->max_eirp)); if (chan->flags & IEEE80211_CHAN_RADAR) { if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms) chan->dfs_cac_ms = max_t(unsigned int, rrule1->dfs_cac_ms, rrule2->dfs_cac_ms); else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } if (chan->orig_mpwr) { /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER * will always follow the passed country IE power settings. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER) chan->max_power = chan->max_reg_power; else chan->max_power = min(chan->orig_mpwr, chan->max_reg_power); } else { chan->max_power = chan->max_reg_power; } } /* Note that right now we assume the desired channel bandwidth * is always 20 MHz for each individual channel (HT40 uses 20 MHz * per channel, the primary and the extension channel). */ static void handle_channel(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_channel *chan) { const u32 orig_chan_freq = ieee80211_channel_to_khz(chan); struct regulatory_request *lr = get_last_request(); struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); const struct ieee80211_reg_rule *rrule = NULL; const struct ieee80211_reg_rule *rrule1 = NULL; const struct ieee80211_reg_rule *rrule2 = NULL; u32 flags = chan->orig_flags; rrule = freq_reg_info(wiphy, orig_chan_freq); if (IS_ERR(rrule)) { /* check for adjacent match, therefore get rules for * chan - 20 MHz and chan + 20 MHz and test * if reg rules are adjacent */ rrule1 = freq_reg_info(wiphy, orig_chan_freq - MHZ_TO_KHZ(20)); rrule2 = freq_reg_info(wiphy, orig_chan_freq + MHZ_TO_KHZ(20)); if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) { struct ieee80211_freq_range comb_range; if (rrule1->freq_range.end_freq_khz != rrule2->freq_range.start_freq_khz) goto disable_chan; comb_range.start_freq_khz = rrule1->freq_range.start_freq_khz; comb_range.end_freq_khz = rrule2->freq_range.end_freq_khz; comb_range.max_bandwidth_khz = min_t(u32, rrule1->freq_range.max_bandwidth_khz, rrule2->freq_range.max_bandwidth_khz); if (!cfg80211_does_bw_fit_range(&comb_range, orig_chan_freq, MHZ_TO_KHZ(20))) goto disable_chan; handle_channel_adjacent_rules(wiphy, initiator, chan, flags, lr, request_wiphy, rrule1, rrule2, &comb_range); return; } disable_chan: /* We will disable all channels that do not match our * received regulatory rule unless the hint is coming * from a Country IE and the Country IE had no information * about a band. The IEEE 802.11 spec allows for an AP * to send only a subset of the regulatory rules allowed, * so an AP in the US that only supports 2.4 GHz may only send * a country IE with information for the 2.4 GHz band * while 5 GHz is still supported. */ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && PTR_ERR(rrule) == -ERANGE) return; if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { pr_debug("Disabling freq %d.%03d MHz for good\n", chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { pr_debug("Disabling freq %d.%03d MHz\n", chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; } handle_channel_single_rule(wiphy, initiator, chan, flags, lr, request_wiphy, rrule); } static void handle_band(struct wiphy *wiphy, enum nl80211_reg_initiator initiator, struct ieee80211_supported_band *sband) { unsigned int i; if (!sband) return; for (i = 0; i < sband->n_channels; i++) handle_channel(wiphy, initiator, &sband->channels[i]); } static bool reg_request_cell_base(struct regulatory_request *request) { if (request->initiator != NL80211_REGDOM_SET_BY_USER) return false; return request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE; } bool reg_last_request_cell_base(void) { return reg_request_cell_base(get_last_request()); } #ifdef CONFIG_CFG80211_REG_CELLULAR_HINTS /* Core specific check */ static enum reg_request_treatment reg_ignore_cell_hint(struct regulatory_request *pending_request) { struct regulatory_request *lr = get_last_request(); if (!reg_num_devs_support_basehint) return REG_REQ_IGNORE; if (reg_request_cell_base(lr) && !regdom_changes(pending_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_OK; } /* Device specific check */ static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS); } #else static enum reg_request_treatment reg_ignore_cell_hint(struct regulatory_request *pending_request) { return REG_REQ_IGNORE; } static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy) { return true; } #endif static bool wiphy_strict_alpha2_regd(struct wiphy *wiphy) { if (wiphy->regulatory_flags & REGULATORY_STRICT_REG && !(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)) return true; return false; } static bool ignore_reg_update(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { struct regulatory_request *lr = get_last_request(); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) return true; if (!lr) { pr_debug("Ignoring regulatory request set by %s since last_request is not set\n", reg_initiator_name(initiator)); return true; } if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n", reg_initiator_name(initiator)); return true; } /* * wiphy->regd will be set once the device has its own * desired regulatory domain set */ if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd && initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && !is_world_regdom(lr->alpha2)) { pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n", reg_initiator_name(initiator)); return true; } if (reg_request_cell_base(lr)) return reg_dev_ignore_cell_hint(wiphy); return false; } static bool reg_is_world_roaming(struct wiphy *wiphy) { const struct ieee80211_regdomain *cr = get_cfg80211_regdom(); const struct ieee80211_regdomain *wr = get_wiphy_regdom(wiphy); struct regulatory_request *lr = get_last_request(); if (is_world_regdom(cr->alpha2) || (wr && is_world_regdom(wr->alpha2))) return true; if (lr && lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) return true; return false; } static void reg_call_notifier(struct wiphy *wiphy, struct regulatory_request *request) { if (wiphy->reg_notifier) wiphy->reg_notifier(wiphy, request); } static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, struct reg_beacon *reg_beacon) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; bool channel_changed = false; struct ieee80211_channel chan_before; struct regulatory_request *lr = get_last_request(); sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; if (likely(!ieee80211_channel_equal(chan, ®_beacon->chan))) return; if (chan->beacon_found) return; chan->beacon_found = true; if (!reg_is_world_roaming(wiphy)) return; if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS) return; chan_before = *chan; if (chan->flags & IEEE80211_CHAN_NO_IR) { chan->flags &= ~IEEE80211_CHAN_NO_IR; channel_changed = true; } if (channel_changed) { nl80211_send_beacon_hint_event(wiphy, &chan_before, chan); if (wiphy->flags & WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON) reg_call_notifier(wiphy, lr); } } /* * Called when a scan on a wiphy finds a beacon on * new channel */ static void wiphy_update_new_beacon(struct wiphy *wiphy, struct reg_beacon *reg_beacon) { unsigned int i; struct ieee80211_supported_band *sband; if (!wiphy->bands[reg_beacon->chan.band]) return; sband = wiphy->bands[reg_beacon->chan.band]; for (i = 0; i < sband->n_channels; i++) handle_reg_beacon(wiphy, i, reg_beacon); } /* * Called upon reg changes or a new wiphy is added */ static void wiphy_update_beacon_reg(struct wiphy *wiphy) { unsigned int i; struct ieee80211_supported_band *sband; struct reg_beacon *reg_beacon; list_for_each_entry(reg_beacon, ®_beacon_list, list) { if (!wiphy->bands[reg_beacon->chan.band]) continue; sband = wiphy->bands[reg_beacon->chan.band]; for (i = 0; i < sband->n_channels; i++) handle_reg_beacon(wiphy, i, reg_beacon); } } /* Reap the advantages of previously found beacons */ static void reg_process_beacons(struct wiphy *wiphy) { /* * Means we are just firing up cfg80211, so no beacons would * have been processed yet. */ if (!last_request) return; wiphy_update_beacon_reg(wiphy); } static bool is_ht40_allowed(struct ieee80211_channel *chan) { if (!chan) return false; if (chan->flags & IEEE80211_CHAN_DISABLED) return false; /* This would happen when regulatory rules disallow HT40 completely */ if ((chan->flags & IEEE80211_CHAN_NO_HT40) == IEEE80211_CHAN_NO_HT40) return false; return true; } static void reg_process_ht_flags_channel(struct wiphy *wiphy, struct ieee80211_channel *channel) { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; const struct ieee80211_regdomain *regd; unsigned int i; u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; return; } /* * We need to ensure the extension channels exist to * be able to use HT40- or HT40+, this finds them (or not) */ for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *c = &sband->channels[i]; if (c->center_freq == (channel->center_freq - 20)) channel_before = c; if (c->center_freq == (channel->center_freq + 20)) channel_after = c; } flags = 0; regd = get_wiphy_regdom(wiphy); if (regd) { const struct ieee80211_reg_rule *reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), regd, MHZ_TO_KHZ(20)); if (!IS_ERR(reg_rule)) flags = reg_rule->flags; } /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ if (!is_ht40_allowed(channel_before) || flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; if (!is_ht40_allowed(channel_after) || flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; } static void reg_process_ht_flags_band(struct wiphy *wiphy, struct ieee80211_supported_band *sband) { unsigned int i; if (!sband) return; for (i = 0; i < sband->n_channels; i++) reg_process_ht_flags_channel(wiphy, &sband->channels[i]); } static void reg_process_ht_flags(struct wiphy *wiphy) { enum nl80211_band band; if (!wiphy) return; for (band = 0; band < NUM_NL80211_BANDS; band++) reg_process_ht_flags_band(wiphy, wiphy->bands[band]); } static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { struct cfg80211_chan_def chandef = {}; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; bool ret; int link; iftype = wdev->iftype; /* make sure the interface is active */ if (!wdev->netdev || !netif_running(wdev->netdev)) return true; for (link = 0; link < ARRAY_SIZE(wdev->links); link++) { struct ieee80211_channel *chan; if (!wdev->valid_links && link > 0) break; if (wdev->valid_links && !(wdev->valid_links & BIT(link))) continue; switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (!wdev->links[link].ap.beacon_interval) continue; chandef = wdev->links[link].ap.chandef; break; case NL80211_IFTYPE_MESH_POINT: if (!wdev->u.mesh.beacon_interval) continue; chandef = wdev->u.mesh.chandef; break; case NL80211_IFTYPE_ADHOC: if (!wdev->u.ibss.ssid_len) continue; chandef = wdev->u.ibss.chandef; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* Maybe we could consider disabling that link only? */ if (!wdev->links[link].client.current_bss) continue; chan = wdev->links[link].client.current_bss->pub.channel; if (!chan) continue; if (!rdev->ops->get_channel || rdev_get_channel(rdev, wdev, link, &chandef)) cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_NO_HT); break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: /* no enforcement required */ break; case NL80211_IFTYPE_OCB: if (!wdev->u.ocb.chandef.chan) continue; chandef = wdev->u.ocb.chandef; break; case NL80211_IFTYPE_NAN: /* we have no info, but NAN is also pretty universal */ continue; default: /* others not implemented for now */ WARN_ON_ONCE(1); break; } switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); if (!ret) return ret; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: ret = cfg80211_chandef_usable(wiphy, &chandef, IEEE80211_CHAN_DISABLED); if (!ret) return ret; break; default: break; } } return true; } static void reg_leave_invalid_chans(struct wiphy *wiphy) { struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); guard(wiphy)(wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) cfg80211_leave(rdev, wdev); } static void reg_check_chans_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; pr_debug("Verifying active interfaces after reg change\n"); rtnl_lock(); for_each_rdev(rdev) reg_leave_invalid_chans(&rdev->wiphy); rtnl_unlock(); } void reg_check_channels(void) { /* * Give usermode a chance to do something nicer (move to another * channel, orderly disconnection), before forcing a disconnection. */ mod_delayed_work(system_power_efficient_wq, ®_check_chans, msecs_to_jiffies(REG_ENFORCE_GRACE_MS)); } static void wiphy_update_regulatory(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { enum nl80211_band band; struct regulatory_request *lr = get_last_request(); if (ignore_reg_update(wiphy, initiator)) { /* * Regulatory updates set by CORE are ignored for custom * regulatory cards. Let us notify the changes to the driver, * as some drivers used this to restore its orig_* reg domain. */ if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->regulatory_flags & REGULATORY_CUSTOM_REG && !(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)) reg_call_notifier(wiphy, lr); return; } lr->dfs_region = get_cfg80211_regdom()->dfs_region; for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band(wiphy, initiator, wiphy->bands[band]); reg_process_beacons(wiphy); reg_process_ht_flags(wiphy); reg_call_notifier(wiphy, lr); } static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; ASSERT_RTNL(); for_each_rdev(rdev) { wiphy = &rdev->wiphy; wiphy_update_regulatory(wiphy, initiator); } reg_check_channels(); } static void handle_channel_custom(struct wiphy *wiphy, struct ieee80211_channel *chan, const struct ieee80211_regdomain *regd, u32 min_bw) { u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; u32 bw, center_freq_khz; center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } return; } power_rule = ®_rule->power_rule; bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); chan->dfs_state_entered = jiffies; chan->dfs_state = NL80211_DFS_USABLE; chan->beacon_found = false; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) chan->flags = chan->orig_flags | bw_flags | map_regdom_flags(reg_rule->flags); else chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags; chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain); chan->max_reg_power = chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); if (chan->flags & IEEE80211_CHAN_RADAR) { if (reg_rule->dfs_cac_ms) chan->dfs_cac_ms = reg_rule->dfs_cac_ms; else chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } if (chan->flags & IEEE80211_CHAN_PSD) chan->psd = reg_rule->psd; chan->max_power = chan->max_reg_power; } static void handle_band_custom(struct wiphy *wiphy, struct ieee80211_supported_band *sband, const struct ieee80211_regdomain *regd) { unsigned int i; if (!sband) return; /* * We currently assume that you always want at least 20 MHz, * otherwise channel 12 might get enabled if this rule is * compatible to US, which permits 2402 - 2472 MHz. */ for (i = 0; i < sband->n_channels; i++) handle_channel_custom(wiphy, &sband->channels[i], regd, MHZ_TO_KHZ(20)); } /* Used by drivers prior to wiphy registration */ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, const struct ieee80211_regdomain *regd) { const struct ieee80211_regdomain *new_regd, *tmp; enum nl80211_band band; unsigned int bands_set = 0; WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG), "wiphy should have REGULATORY_CUSTOM_REG\n"); wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!wiphy->bands[band]) continue; handle_band_custom(wiphy, wiphy->bands[band], regd); bands_set++; } /* * no point in calling this if it won't have any effect * on your device's supported bands. */ WARN_ON(!bands_set); new_regd = reg_copy_regd(regd); if (IS_ERR(new_regd)) return; rtnl_lock(); scoped_guard(wiphy, wiphy) { tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); } rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); static void reg_set_request_processed(void) { bool need_more_processing = false; struct regulatory_request *lr = get_last_request(); lr->processed = true; spin_lock(®_requests_lock); if (!list_empty(®_requests_list)) need_more_processing = true; spin_unlock(®_requests_lock); cancel_crda_timeout(); if (need_more_processing) schedule_work(®_work); } /** * reg_process_hint_core - process core regulatory requests * @core_request: a pending core regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by the regulatory core. * * Returns: %REG_REQ_OK or %REG_REQ_IGNORE, indicating if the * hint was processed or ignored */ static enum reg_request_treatment reg_process_hint_core(struct regulatory_request *core_request) { if (reg_query_database(core_request)) { core_request->intersect = false; core_request->processed = false; reg_update_last_request(core_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_user(struct regulatory_request *user_request) { struct regulatory_request *lr = get_last_request(); if (reg_request_cell_base(user_request)) return reg_ignore_cell_hint(user_request); if (reg_request_cell_base(lr)) return REG_REQ_IGNORE; if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_REQ_INTERSECT; /* * If the user knows better the user should set the regdom * to their country before the IE is picked up */ if (lr->initiator == NL80211_REGDOM_SET_BY_USER && lr->intersect) return REG_REQ_IGNORE; /* * Process user requests only after previous user/driver/core * requests have been processed */ if ((lr->initiator == NL80211_REGDOM_SET_BY_CORE || lr->initiator == NL80211_REGDOM_SET_BY_DRIVER || lr->initiator == NL80211_REGDOM_SET_BY_USER) && regdom_changes(lr->alpha2)) return REG_REQ_IGNORE; if (!regdom_changes(user_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_OK; } /** * reg_process_hint_user - process user regulatory requests * @user_request: a pending user regulatory request * * The wireless subsystem can use this function to process * a regulatory request initiated by userspace. * * Returns: %REG_REQ_OK or %REG_REQ_IGNORE, indicating if the * hint was processed or ignored */ static enum reg_request_treatment reg_process_hint_user(struct regulatory_request *user_request) { enum reg_request_treatment treatment; treatment = __reg_process_hint_user(user_request); if (treatment == REG_REQ_IGNORE || treatment == REG_REQ_ALREADY_SET) return REG_REQ_IGNORE; user_request->intersect = treatment == REG_REQ_INTERSECT; user_request->processed = false; if (reg_query_database(user_request)) { reg_update_last_request(user_request); user_alpha2[0] = user_request->alpha2[0]; user_alpha2[1] = user_request->alpha2[1]; return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_driver(struct regulatory_request *driver_request) { struct regulatory_request *lr = get_last_request(); if (lr->initiator == NL80211_REGDOM_SET_BY_CORE) { if (regdom_changes(driver_request->alpha2)) return REG_REQ_OK; return REG_REQ_ALREADY_SET; } /* * This would happen if you unplug and plug your card * back in or if you add a new device for which the previously * loaded card also agrees on the regulatory domain. */ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && !regdom_changes(driver_request->alpha2)) return REG_REQ_ALREADY_SET; return REG_REQ_INTERSECT; } /** * reg_process_hint_driver - process driver regulatory requests * @wiphy: the wireless device for the regulatory request * @driver_request: a pending driver regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by an 802.11 driver. * * Returns: one of the different reg request treatment values. */ static enum reg_request_treatment reg_process_hint_driver(struct wiphy *wiphy, struct regulatory_request *driver_request) { const struct ieee80211_regdomain *regd, *tmp; enum reg_request_treatment treatment; treatment = __reg_process_hint_driver(driver_request); switch (treatment) { case REG_REQ_OK: break; case REG_REQ_IGNORE: return REG_REQ_IGNORE; case REG_REQ_INTERSECT: case REG_REQ_ALREADY_SET: regd = reg_copy_regd(get_cfg80211_regdom()); if (IS_ERR(regd)) return REG_REQ_IGNORE; tmp = get_wiphy_regdom(wiphy); ASSERT_RTNL(); scoped_guard(wiphy, wiphy) { rcu_assign_pointer(wiphy->regd, regd); } rcu_free_regdom(tmp); } driver_request->intersect = treatment == REG_REQ_INTERSECT; driver_request->processed = false; /* * Since CRDA will not be called in this case as we already * have applied the requested regulatory domain before we just * inform userspace we have processed the request */ if (treatment == REG_REQ_ALREADY_SET) { nl80211_send_reg_change_event(driver_request); reg_update_last_request(driver_request); reg_set_request_processed(); return REG_REQ_ALREADY_SET; } if (reg_query_database(driver_request)) { reg_update_last_request(driver_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } static enum reg_request_treatment __reg_process_hint_country_ie(struct wiphy *wiphy, struct regulatory_request *country_ie_request) { struct wiphy *last_wiphy = NULL; struct regulatory_request *lr = get_last_request(); if (reg_request_cell_base(lr)) { /* Trust a Cell base station over the AP's country IE */ if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_IGNORE; return REG_REQ_ALREADY_SET; } else { if (wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_IGNORE) return REG_REQ_IGNORE; } if (unlikely(!is_an_alpha2(country_ie_request->alpha2))) return -EINVAL; if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_REQ_OK; last_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different * Country IE alpha2s. We could * intersect them, but that seems unlikely * to be correct. Reject second one for now. */ if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_IGNORE; return REG_REQ_ALREADY_SET; } if (regdom_changes(country_ie_request->alpha2)) return REG_REQ_OK; return REG_REQ_ALREADY_SET; } /** * reg_process_hint_country_ie - process regulatory requests from country IEs * @wiphy: the wireless device for the regulatory request * @country_ie_request: a regulatory request from a country IE * * The wireless subsystem can use this function to process * a regulatory request issued by a country Information Element. * * Returns: one of the different reg request treatment values. */ static enum reg_request_treatment reg_process_hint_country_ie(struct wiphy *wiphy, struct regulatory_request *country_ie_request) { enum reg_request_treatment treatment; treatment = __reg_process_hint_country_ie(wiphy, country_ie_request); switch (treatment) { case REG_REQ_OK: break; case REG_REQ_IGNORE: return REG_REQ_IGNORE; case REG_REQ_ALREADY_SET: reg_free_request(country_ie_request); return REG_REQ_ALREADY_SET; case REG_REQ_INTERSECT: /* * This doesn't happen yet, not sure we * ever want to support it for this case. */ WARN_ONCE(1, "Unexpected intersection for country elements"); return REG_REQ_IGNORE; } country_ie_request->intersect = false; country_ie_request->processed = false; if (reg_query_database(country_ie_request)) { reg_update_last_request(country_ie_request); return REG_REQ_OK; } return REG_REQ_IGNORE; } bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2) { const struct ieee80211_regdomain *wiphy1_regd = NULL; const struct ieee80211_regdomain *wiphy2_regd = NULL; const struct ieee80211_regdomain *cfg80211_regd = NULL; bool dfs_domain_same; rcu_read_lock(); cfg80211_regd = rcu_dereference(cfg80211_regdomain); wiphy1_regd = rcu_dereference(wiphy1->regd); if (!wiphy1_regd) wiphy1_regd = cfg80211_regd; wiphy2_regd = rcu_dereference(wiphy2->regd); if (!wiphy2_regd) wiphy2_regd = cfg80211_regd; dfs_domain_same = wiphy1_regd->dfs_region == wiphy2_regd->dfs_region; rcu_read_unlock(); return dfs_domain_same; } static void reg_copy_dfs_chan_state(struct ieee80211_channel *dst_chan, struct ieee80211_channel *src_chan) { if (!(dst_chan->flags & IEEE80211_CHAN_RADAR) || !(src_chan->flags & IEEE80211_CHAN_RADAR)) return; if (dst_chan->flags & IEEE80211_CHAN_DISABLED || src_chan->flags & IEEE80211_CHAN_DISABLED) return; if (src_chan->center_freq == dst_chan->center_freq && dst_chan->dfs_state == NL80211_DFS_USABLE) { dst_chan->dfs_state = src_chan->dfs_state; dst_chan->dfs_state_entered = src_chan->dfs_state_entered; } } static void wiphy_share_dfs_chan_state(struct wiphy *dst_wiphy, struct wiphy *src_wiphy) { struct ieee80211_supported_band *src_sband, *dst_sband; struct ieee80211_channel *src_chan, *dst_chan; int i, j, band; if (!reg_dfs_domain_same(dst_wiphy, src_wiphy)) return; for (band = 0; band < NUM_NL80211_BANDS; band++) { dst_sband = dst_wiphy->bands[band]; src_sband = src_wiphy->bands[band]; if (!dst_sband || !src_sband) continue; for (i = 0; i < dst_sband->n_channels; i++) { dst_chan = &dst_sband->channels[i]; for (j = 0; j < src_sband->n_channels; j++) { src_chan = &src_sband->channels[j]; reg_copy_dfs_chan_state(dst_chan, src_chan); } } } } static void wiphy_all_share_dfs_chan_state(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); for_each_rdev(rdev) { if (wiphy == &rdev->wiphy) continue; wiphy_share_dfs_chan_state(wiphy, &rdev->wiphy); } } /* This processes *all* regulatory hints */ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; case NL80211_REGDOM_SET_BY_USER: treatment = reg_process_hint_user(reg_request); break; case NL80211_REGDOM_SET_BY_DRIVER: if (!wiphy) goto out_free; treatment = reg_process_hint_driver(wiphy, reg_request); break; case NL80211_REGDOM_SET_BY_COUNTRY_IE: if (!wiphy) goto out_free; treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: WARN(1, "invalid initiator %d\n", initiator); goto out_free; } if (treatment == REG_REQ_IGNORE) goto out_free; WARN(treatment != REG_REQ_OK && treatment != REG_REQ_ALREADY_SET, "unexpected treatment value %d\n", treatment); /* This is required so that the orig_* parameters are saved. * NOTE: treatment must be set for any case that reaches here! */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } return; out_free: reg_free_request(reg_request); } static void notify_self_managed_wiphys(struct regulatory_request *request) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; for_each_rdev(rdev) { wiphy = &rdev->wiphy; if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED && request->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, request); } } /* * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* * Regulatory hints come on a first come first serve basis and we * must process each one atomically. */ static void reg_process_pending_hints(void) { struct regulatory_request *reg_request, *lr; lr = get_last_request(); /* When last_request->processed becomes true this will be rescheduled */ if (lr && !lr->processed) { pr_debug("Pending regulatory request, waiting for it to be processed...\n"); return; } spin_lock(®_requests_lock); if (list_empty(®_requests_list)) { spin_unlock(®_requests_lock); return; } reg_request = list_first_entry(®_requests_list, struct regulatory_request, list); list_del_init(®_request->list); spin_unlock(®_requests_lock); notify_self_managed_wiphys(reg_request); reg_process_hint(reg_request); lr = get_last_request(); spin_lock(®_requests_lock); if (!list_empty(®_requests_list) && lr && lr->processed) schedule_work(®_work); spin_unlock(®_requests_lock); } /* Processes beacon hints -- this has nothing to do with country IEs */ static void reg_process_pending_beacon_hints(void) { struct cfg80211_registered_device *rdev; struct reg_beacon *pending_beacon, *tmp; /* This goes through the _pending_ beacon list */ spin_lock_bh(®_pending_beacons_lock); list_for_each_entry_safe(pending_beacon, tmp, ®_pending_beacons, list) { list_del_init(&pending_beacon->list); /* Applies the beacon hint to current wiphys */ for_each_rdev(rdev) wiphy_update_new_beacon(&rdev->wiphy, pending_beacon); /* Remembers the beacon hint for new wiphys or reg changes */ list_add_tail(&pending_beacon->list, ®_beacon_list); } spin_unlock_bh(®_pending_beacons_lock); } static void reg_process_self_managed_hint(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; enum nl80211_band band; struct regulatory_request request = {}; ASSERT_RTNL(); lockdep_assert_wiphy(wiphy); spin_lock(®_requests_lock); regd = rdev->requested_regd; rdev->requested_regd = NULL; spin_unlock(®_requests_lock); if (!regd) return; tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, regd); rcu_free_regdom(tmp); for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band_custom(wiphy, wiphy->bands[band], regd); reg_process_ht_flags(wiphy); request.wiphy_idx = get_wiphy_idx(wiphy); request.alpha2[0] = regd->alpha2[0]; request.alpha2[1] = regd->alpha2[1]; request.initiator = NL80211_REGDOM_SET_BY_DRIVER; if (wiphy->flags & WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER) reg_call_notifier(wiphy, &request); nl80211_send_wiphy_reg_change_event(&request); } static void reg_process_self_managed_hints(void) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); for_each_rdev(rdev) { guard(wiphy)(&rdev->wiphy); reg_process_self_managed_hint(&rdev->wiphy); } reg_check_channels(); } static void reg_todo(struct work_struct *work) { rtnl_lock(); reg_process_pending_hints(); reg_process_pending_beacon_hints(); reg_process_self_managed_hints(); rtnl_unlock(); } static void queue_regulatory_request(struct regulatory_request *request) { request->alpha2[0] = toupper(request->alpha2[0]); request->alpha2[1] = toupper(request->alpha2[1]); spin_lock(®_requests_lock); list_add_tail(&request->list, ®_requests_list); spin_unlock(®_requests_lock); schedule_work(®_work); } /* * Core regulatory hint -- happens during cfg80211_init() * and when we restore regulatory settings. */ static int regulatory_hint_core(const char *alpha2) { struct regulatory_request *request; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); return 0; } /* User hints */ int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type) { struct regulatory_request *request; if (WARN_ON(!alpha2)) return -EINVAL; if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2)) return -EINVAL; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->wiphy_idx = WIPHY_IDX_INVALID; request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_USER; request->user_reg_hint_type = user_reg_hint_type; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); return 0; } void regulatory_hint_indoor(bool is_indoor, u32 portid) { spin_lock(®_indoor_lock); /* It is possible that more than one user space process is trying to * configure the indoor setting. To handle such cases, clear the indoor * setting in case that some process does not think that the device * is operating in an indoor environment. In addition, if a user space * process indicates that it is controlling the indoor setting, save its * portid, i.e., make it the owner. */ reg_is_indoor = is_indoor; if (reg_is_indoor) { if (!reg_is_indoor_portid) reg_is_indoor_portid = portid; } else { reg_is_indoor_portid = 0; } spin_unlock(®_indoor_lock); if (!is_indoor) reg_check_channels(); } void regulatory_netlink_notify(u32 portid) { spin_lock(®_indoor_lock); if (reg_is_indoor_portid != portid) { spin_unlock(®_indoor_lock); return; } reg_is_indoor = false; reg_is_indoor_portid = 0; spin_unlock(®_indoor_lock); reg_check_channels(); } /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { struct regulatory_request *request; if (WARN_ON(!alpha2 || !wiphy)) return -EINVAL; wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG; request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; request->wiphy_idx = get_wiphy_idx(wiphy); request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_DRIVER; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); return 0; } EXPORT_SYMBOL(regulatory_hint); void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band, const u8 *country_ie, u8 country_ie_len) { char alpha2[2]; enum environment_cap env = ENVIRON_ANY; struct regulatory_request *request = NULL, *lr; /* IE len must be evenly divisible by 2 */ if (country_ie_len & 0x01) return; if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return; request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return; alpha2[0] = country_ie[0]; alpha2[1] = country_ie[1]; if (country_ie[2] == 'I') env = ENVIRON_INDOOR; else if (country_ie[2] == 'O') env = ENVIRON_OUTDOOR; rcu_read_lock(); lr = get_last_request(); if (unlikely(!lr)) goto out; /* * We will run this only upon a successful connection on cfg80211. * We leave conflict resolution to the workqueue, where can hold * the RTNL. */ if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && lr->wiphy_idx != WIPHY_IDX_INVALID) goto out; request->wiphy_idx = get_wiphy_idx(wiphy); request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; request->country_ie_env = env; /* Allow calling CRDA again */ reset_crda_timeouts(); queue_regulatory_request(request); request = NULL; out: kfree(request); rcu_read_unlock(); } static void restore_alpha2(char *alpha2, bool reset_user) { /* indicates there is no alpha2 to consider for restoration */ alpha2[0] = '9'; alpha2[1] = '7'; /* The user setting has precedence over the module parameter */ if (is_user_regdom_saved()) { /* Unless we're asked to ignore it and reset it */ if (reset_user) { pr_debug("Restoring regulatory settings including user preference\n"); user_alpha2[0] = '9'; user_alpha2[1] = '7'; /* * If we're ignoring user settings, we still need to * check the module parameter to ensure we put things * back as they were for a full restore. */ if (!is_world_regdom(ieee80211_regdom)) { pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } } else { pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n", user_alpha2[0], user_alpha2[1]); alpha2[0] = user_alpha2[0]; alpha2[1] = user_alpha2[1]; } } else if (!is_world_regdom(ieee80211_regdom)) { pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n", ieee80211_regdom[0], ieee80211_regdom[1]); alpha2[0] = ieee80211_regdom[0]; alpha2[1] = ieee80211_regdom[1]; } else pr_debug("Restoring regulatory settings\n"); } static void restore_custom_reg_settings(struct wiphy *wiphy) { struct ieee80211_supported_band *sband; enum nl80211_band band; struct ieee80211_channel *chan; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; chan->flags = chan->orig_flags; chan->max_antenna_gain = chan->orig_mag; chan->max_power = chan->orig_mpwr; chan->beacon_found = false; } } } /* * Restoring regulatory settings involves ignoring any * possibly stale country IE information and user regulatory * settings if so desired, this includes any beacon hints * learned as we could have traveled outside to another country * after disconnection. To restore regulatory settings we do * exactly what we did at bootup: * * - send a core regulatory hint * - send a user regulatory hint if applicable * * Device drivers that send a regulatory hint for a specific country * keep their own regulatory domain on wiphy->regd so that does * not need to be remembered. */ static void restore_regulatory_settings(bool reset_user, bool cached) { char alpha2[2]; char world_alpha2[2]; struct reg_beacon *reg_beacon, *btmp; LIST_HEAD(tmp_reg_req_list); struct cfg80211_registered_device *rdev; ASSERT_RTNL(); /* * Clear the indoor setting in case that it is not controlled by user * space, as otherwise there is no guarantee that the device is still * operating in an indoor environment. */ spin_lock(®_indoor_lock); if (reg_is_indoor && !reg_is_indoor_portid) { reg_is_indoor = false; reg_check_channels(); } spin_unlock(®_indoor_lock); reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); /* * If there's any pending requests we simply * stash them to a temporary pending queue and * add then after we've restored regulatory * settings. */ spin_lock(®_requests_lock); list_splice_tail_init(®_requests_list, &tmp_reg_req_list); spin_unlock(®_requests_lock); /* Clear beacon hints */ spin_lock_bh(®_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, ®_pending_beacons, list) { list_del(®_beacon->list); kfree(reg_beacon); } spin_unlock_bh(®_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, ®_beacon_list, list) { list_del(®_beacon->list); kfree(reg_beacon); } /* First restore to the basic regulatory settings */ world_alpha2[0] = cfg80211_world_regdom->alpha2[0]; world_alpha2[1] = cfg80211_world_regdom->alpha2[1]; for_each_rdev(rdev) { if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) continue; if (rdev->wiphy.regulatory_flags & REGULATORY_CUSTOM_REG) restore_custom_reg_settings(&rdev->wiphy); } if (cached && (!is_an_alpha2(alpha2) || !IS_ERR_OR_NULL(cfg80211_user_regdom))) { reset_regdomains(false, cfg80211_world_regdom); update_all_wiphy_regulatory(NL80211_REGDOM_SET_BY_CORE); print_regdomain(get_cfg80211_regdom()); nl80211_send_reg_change_event(&core_request_world); reg_set_request_processed(); if (is_an_alpha2(alpha2) && !regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER)) { struct regulatory_request *ureq; spin_lock(®_requests_lock); ureq = list_last_entry(®_requests_list, struct regulatory_request, list); list_del(&ureq->list); spin_unlock(®_requests_lock); notify_self_managed_wiphys(ureq); reg_update_last_request(ureq); set_regdom(reg_copy_regd(cfg80211_user_regdom), REGD_SOURCE_CACHED); } } else { regulatory_hint_core(world_alpha2); /* * This restores the ieee80211_regdom module parameter * preference or the last user requested regulatory * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); } spin_lock(®_requests_lock); list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); spin_unlock(®_requests_lock); pr_debug("Kicking the queue\n"); schedule_work(®_work); } static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; for_each_rdev(rdev) { guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!(wdev->wiphy->regulatory_flags & flag)) return false; } } return true; } void regulatory_hint_disconnect(void) { /* Restore of regulatory settings is not required when wiphy(s) * ignore IE from connected access point but clearance of beacon hints * is required when wiphy(s) supports beacon hints. */ if (is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) { struct reg_beacon *reg_beacon, *btmp; if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS)) return; spin_lock_bh(®_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, ®_pending_beacons, list) { list_del(®_beacon->list); kfree(reg_beacon); } spin_unlock_bh(®_pending_beacons_lock); list_for_each_entry_safe(reg_beacon, btmp, ®_beacon_list, list) { list_del(®_beacon->list); kfree(reg_beacon); } return; } pr_debug("All devices are disconnected, going to restore regulatory settings\n"); restore_regulatory_settings(false, true); } static bool freq_is_chan_12_13_14(u32 freq) { if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || freq == ieee80211_channel_to_frequency(14, NL80211_BAND_2GHZ)) return true; return false; } static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) { struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, ®_pending_beacons, list) if (ieee80211_channel_equal(beacon_chan, &pending_beacon->chan)) return true; return false; } void regulatory_hint_found_beacon(struct wiphy *wiphy, struct ieee80211_channel *beacon_chan, gfp_t gfp) { struct reg_beacon *reg_beacon; bool processing; if (beacon_chan->beacon_found || beacon_chan->flags & IEEE80211_CHAN_RADAR || (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) return; spin_lock_bh(®_pending_beacons_lock); processing = pending_reg_beacon(beacon_chan); spin_unlock_bh(®_pending_beacons_lock); if (processing) return; reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); if (!reg_beacon) return; pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", beacon_chan->center_freq, beacon_chan->freq_offset, ieee80211_freq_khz_to_channel( ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, sizeof(struct ieee80211_channel)); /* * Since we can be called from BH or and non-BH context * we must use spin_lock_bh() */ spin_lock_bh(®_pending_beacons_lock); list_add_tail(®_beacon->list, ®_pending_beacons); spin_unlock_bh(®_pending_beacons_lock); schedule_work(®_work); } static void print_rd_rules(const struct ieee80211_regdomain *rd) { unsigned int i; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; const struct ieee80211_power_rule *power_rule = NULL; char bw[32], cac_time[32]; pr_debug(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n"); for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; freq_range = ®_rule->freq_range; power_rule = ®_rule->power_rule; if (reg_rule->flags & NL80211_RRF_AUTO_BW) snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO", freq_range->max_bandwidth_khz, reg_get_max_bandwidth(rd, reg_rule)); else snprintf(bw, sizeof(bw), "%d KHz", freq_range->max_bandwidth_khz); if (reg_rule->flags & NL80211_RRF_DFS) scnprintf(cac_time, sizeof(cac_time), "%u s", reg_rule->dfs_cac_ms/1000); else scnprintf(cac_time, sizeof(cac_time), "N/A"); /* * There may not be documentation for max antenna gain * in certain regions */ if (power_rule->max_antenna_gain) pr_debug(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, power_rule->max_antenna_gain, power_rule->max_eirp, cac_time); else pr_debug(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, power_rule->max_eirp, cac_time); } } bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region) { switch (dfs_region) { case NL80211_DFS_UNSET: case NL80211_DFS_FCC: case NL80211_DFS_ETSI: case NL80211_DFS_JP: return true; default: pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region); return false; } } static void print_regdomain(const struct ieee80211_regdomain *rd) { struct regulatory_request *lr = get_last_request(); if (is_intersected_alpha2(rd->alpha2)) { if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) { struct cfg80211_registered_device *rdev; rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx); if (rdev) { pr_debug("Current regulatory domain updated by AP to: %c%c\n", rdev->country_ie_alpha2[0], rdev->country_ie_alpha2[1]); } else pr_debug("Current regulatory domain intersected:\n"); } else pr_debug("Current regulatory domain intersected:\n"); } else if (is_world_regdom(rd->alpha2)) { pr_debug("World regulatory domain updated:\n"); } else { if (is_unknown_alpha2(rd->alpha2)) pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n"); else { if (reg_request_cell_base(lr)) pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n", rd->alpha2[0], rd->alpha2[1]); else pr_debug("Regulatory domain changed to country: %c%c\n", rd->alpha2[0], rd->alpha2[1]); } } pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region)); print_rd_rules(rd); } static void print_regdomain_info(const struct ieee80211_regdomain *rd) { pr_debug("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_rd_rules(rd); } static int reg_set_rd_core(const struct ieee80211_regdomain *rd) { if (!is_world_regdom(rd->alpha2)) return -EINVAL; update_world_regdomain(rd); return 0; } static int reg_set_rd_user(const struct ieee80211_regdomain *rd, struct regulatory_request *user_request) { const struct ieee80211_regdomain *intersected_rd = NULL; if (!regdom_changes(rd->alpha2)) return -EALREADY; if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } if (!user_request->intersect) { reset_regdomains(false, rd); return 0; } intersected_rd = regdom_intersect(rd, get_cfg80211_regdom()); if (!intersected_rd) return -EINVAL; kfree(rd); rd = NULL; reset_regdomains(false, intersected_rd); return 0; } static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, struct regulatory_request *driver_request) { const struct ieee80211_regdomain *regd; const struct ieee80211_regdomain *intersected_rd = NULL; const struct ieee80211_regdomain *tmp = NULL; struct wiphy *request_wiphy; if (is_world_regdom(rd->alpha2)) return -EINVAL; if (!regdom_changes(rd->alpha2)) return -EALREADY; if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx); if (!request_wiphy) return -ENODEV; if (!driver_request->intersect) { ASSERT_RTNL(); scoped_guard(wiphy, request_wiphy) { if (request_wiphy->regd) tmp = get_wiphy_regdom(request_wiphy); regd = reg_copy_regd(rd); if (IS_ERR(regd)) return PTR_ERR(regd); rcu_assign_pointer(request_wiphy->regd, regd); rcu_free_regdom(tmp); } reset_regdomains(false, rd); return 0; } intersected_rd = regdom_intersect(rd, get_cfg80211_regdom()); if (!intersected_rd) return -EINVAL; /* * We can trash what CRDA provided now. * However if a driver requested this specific regulatory * domain we keep it for its private use */ tmp = get_wiphy_regdom(request_wiphy); rcu_assign_pointer(request_wiphy->regd, rd); rcu_free_regdom(tmp); rd = NULL; reset_regdomains(false, intersected_rd); return 0; } static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, struct regulatory_request *country_ie_request) { struct wiphy *request_wiphy; if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) && !is_unknown_alpha2(rd->alpha2)) return -EINVAL; /* * Lets only bother proceeding on the same alpha2 if the current * rd is non static (it means CRDA was present and was used last) * and the pending request came in from a country IE */ if (!is_valid_rd(rd)) { pr_err("Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx); if (!request_wiphy) return -ENODEV; if (country_ie_request->intersect) return -EINVAL; reset_regdomains(false, rd); return 0; } /* * Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already * kmalloc'd the rd structure. */ int set_regdom(const struct ieee80211_regdomain *rd, enum ieee80211_regd_source regd_src) { struct regulatory_request *lr; bool user_reset = false; int r; if (IS_ERR_OR_NULL(rd)) return -ENODATA; if (!reg_is_valid_request(rd->alpha2)) { kfree(rd); return -EINVAL; } if (regd_src == REGD_SOURCE_CRDA) reset_crda_timeouts(); lr = get_last_request(); /* Note that this doesn't update the wiphys, this is done below */ switch (lr->initiator) { case NL80211_REGDOM_SET_BY_CORE: r = reg_set_rd_core(rd); break; case NL80211_REGDOM_SET_BY_USER: cfg80211_save_user_regdom(rd); r = reg_set_rd_user(rd, lr); user_reset = true; break; case NL80211_REGDOM_SET_BY_DRIVER: r = reg_set_rd_driver(rd, lr); break; case NL80211_REGDOM_SET_BY_COUNTRY_IE: r = reg_set_rd_country_ie(rd, lr); break; default: WARN(1, "invalid initiator %d\n", lr->initiator); kfree(rd); return -EINVAL; } if (r) { switch (r) { case -EALREADY: reg_set_request_processed(); break; default: /* Back to world regulatory in case of errors */ restore_regulatory_settings(user_reset, false); } kfree(rd); return r; } /* This would make this whole thing pointless */ if (WARN_ON(!lr->intersect && rd != get_cfg80211_regdom())) return -EINVAL; /* update all wiphys now with the new established regulatory domain */ update_all_wiphy_regulatory(lr->initiator); print_regdomain(get_cfg80211_regdom()); nl80211_send_reg_change_event(lr); reg_set_request_processed(); return 0; } static int __regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { const struct ieee80211_regdomain *regd; const struct ieee80211_regdomain *prev_regd; struct cfg80211_registered_device *rdev; if (WARN_ON(!wiphy || !rd)) return -EINVAL; if (WARN(!(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED), "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n")) return -EPERM; if (WARN(!is_valid_rd(rd), "Invalid regulatory domain detected: %c%c\n", rd->alpha2[0], rd->alpha2[1])) { print_regdomain_info(rd); return -EINVAL; } regd = reg_copy_regd(rd); if (IS_ERR(regd)) return PTR_ERR(regd); rdev = wiphy_to_rdev(wiphy); spin_lock(®_requests_lock); prev_regd = rdev->requested_regd; rdev->requested_regd = regd; spin_unlock(®_requests_lock); kfree(prev_regd); return 0; } int regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { int ret = __regulatory_set_wiphy_regd(wiphy, rd); if (ret) return ret; schedule_work(®_work); return 0; } EXPORT_SYMBOL(regulatory_set_wiphy_regd); int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, struct ieee80211_regdomain *rd) { int ret; ASSERT_RTNL(); ret = __regulatory_set_wiphy_regd(wiphy, rd); if (ret) return ret; /* process the request immediately */ reg_process_self_managed_hint(wiphy); reg_check_channels(); return 0; } EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync); void wiphy_regulatory_register(struct wiphy *wiphy) { struct regulatory_request *lr = get_last_request(); /* self-managed devices ignore beacon hints and country IE */ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS | REGULATORY_COUNTRY_IE_IGNORE; /* * The last request may have been received before this * registration call. Call the driver notifier if * initiator is USER. */ if (lr->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, lr); } if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint++; wiphy_update_regulatory(wiphy, lr->initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_process_self_managed_hints(); } void wiphy_regulatory_deregister(struct wiphy *wiphy) { struct wiphy *request_wiphy = NULL; struct regulatory_request *lr; lr = get_last_request(); if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint--; rcu_free_regdom(get_wiphy_regdom(wiphy)); RCU_INIT_POINTER(wiphy->regd, NULL); if (lr) request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); if (!request_wiphy || request_wiphy != wiphy) return; lr->wiphy_idx = WIPHY_IDX_INVALID; lr->country_ie_env = ENVIRON_ANY; } /* * See FCC notices for UNII band definitions * 5GHz: https://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii * 6GHz: https://www.fcc.gov/document/fcc-proposes-more-spectrum-unlicensed-use-0 */ int cfg80211_get_unii(int freq) { /* UNII-1 */ if (freq >= 5150 && freq <= 5250) return 0; /* UNII-2A */ if (freq > 5250 && freq <= 5350) return 1; /* UNII-2B */ if (freq > 5350 && freq <= 5470) return 2; /* UNII-2C */ if (freq > 5470 && freq <= 5725) return 3; /* UNII-3 */ if (freq > 5725 && freq <= 5825) return 4; /* UNII-5 */ if (freq > 5925 && freq <= 6425) return 5; /* UNII-6 */ if (freq > 6425 && freq <= 6525) return 6; /* UNII-7 */ if (freq > 6525 && freq <= 6875) return 7; /* UNII-8 */ if (freq > 6875 && freq <= 7125) return 8; return -EINVAL; } bool regulatory_indoor_allowed(void) { return reg_is_indoor; } bool regulatory_pre_cac_allowed(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; bool pre_cac_allowed = false; rcu_read_lock(); regd = rcu_dereference(cfg80211_regdomain); wiphy_regd = rcu_dereference(wiphy->regd); if (!wiphy_regd) { if (regd->dfs_region == NL80211_DFS_ETSI) pre_cac_allowed = true; rcu_read_unlock(); return pre_cac_allowed; } if (regd->dfs_region == wiphy_regd->dfs_region && wiphy_regd->dfs_region == NL80211_DFS_ETSI) pre_cac_allowed = true; rcu_read_unlock(); return pre_cac_allowed; } EXPORT_SYMBOL(regulatory_pre_cac_allowed); static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; unsigned int link_id; guard(wiphy)(&rdev->wiphy); /* If we finished CAC or received radar, we should end any * CAC running on the same channels. * the check !cfg80211_chandef_dfs_usable contain 2 options: * either all channels are available - those the CAC_FINISHED * event has effected another wdev state, or there is a channel * in unavailable state in wdev chandef - those the RADAR_DETECTED * event has effected another wdev state. * In both cases we should end the CAC on the wdev. */ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { struct cfg80211_chan_def *chandef; for_each_valid_link(wdev, link_id) { if (!wdev->links[link_id].cac_started) continue; chandef = wdev_chandef(wdev, link_id); if (!chandef) continue; if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef)) rdev_end_cac(rdev, wdev->netdev, link_id); } } } void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state, enum nl80211_radar_event event) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); if (WARN_ON(!cfg80211_chandef_valid(chandef))) return; for_each_rdev(rdev) { if (wiphy == &rdev->wiphy) continue; if (!reg_dfs_domain_same(wiphy, &rdev->wiphy)) continue; if (!ieee80211_get_channel(&rdev->wiphy, chandef->chan->center_freq)) continue; cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state); if (event == NL80211_RADAR_DETECTED || event == NL80211_RADAR_CAC_FINISHED) { cfg80211_sched_dfs_chan_update(rdev); cfg80211_check_and_end_cac(rdev); } nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL); } } static int __init regulatory_init_db(void) { int err; /* * It's possible that - due to other bugs/issues - cfg80211 * never called regulatory_init() below, or that it failed; * in that case, don't try to do any further work here as * it's doomed to lead to crashes. */ if (!reg_fdev) return -EINVAL; err = load_builtin_regdb_keys(); if (err) { faux_device_destroy(reg_fdev); return err; } /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { if (err == -ENOMEM) { faux_device_destroy(reg_fdev); return err; } /* * N.B. kobject_uevent_env() can fail mainly for when we're out * memory which is handled and propagated appropriately above * but it can also fail during a netlink_broadcast() or during * early boot for call_usermodehelper(). For now treat these * errors as non-fatal. */ pr_err("kobject_uevent_env() was unable to call CRDA during init\n"); } /* * Finally, if the user set the module parameter treat it * as a user hint. */ if (!is_world_regdom(ieee80211_regdom)) regulatory_hint_user(ieee80211_regdom, NL80211_USER_REG_HINT_USER); return 0; } #ifndef MODULE late_initcall(regulatory_init_db); #endif int __init regulatory_init(void) { reg_fdev = faux_device_create("regulatory", NULL, NULL); if (!reg_fdev) return -ENODEV; rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; user_alpha2[1] = '7'; #ifdef MODULE return regulatory_init_db(); #else return 0; #endif } void regulatory_exit(void) { struct regulatory_request *reg_request, *tmp; struct reg_beacon *reg_beacon, *btmp; cancel_work_sync(®_work); cancel_crda_timeout_sync(); cancel_delayed_work_sync(®_check_chans); /* Lock to suppress warnings */ rtnl_lock(); reset_regdomains(true, NULL); rtnl_unlock(); dev_set_uevent_suppress(®_fdev->dev, true); faux_device_destroy(reg_fdev); list_for_each_entry_safe(reg_beacon, btmp, ®_pending_beacons, list) { list_del(®_beacon->list); kfree(reg_beacon); } list_for_each_entry_safe(reg_beacon, btmp, ®_beacon_list, list) { list_del(®_beacon->list); kfree(reg_beacon); } list_for_each_entry_safe(reg_request, tmp, ®_requests_list, list) { list_del(®_request->list); kfree(reg_request); } if (!IS_ERR_OR_NULL(regdb)) kfree(regdb); if (!IS_ERR_OR_NULL(cfg80211_user_regdom)) kfree(cfg80211_user_regdom); free_regdb_keyring(); } |
| 552 552 552 552 552 10 10 10 10 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> #include <net/hotdata.h> struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; local_lock_t bh_lock; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; bool have_bh_lock = false; struct gro_cell *cell; int res; rcu_read_lock(); if (unlikely(!(dev->flags & IFF_UP))) goto drop; if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { res = netif_rx(skb); goto unlock; } local_lock_nested_bh(&gcells->cells->bh_lock); have_bh_lock = true; cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); res = NET_RX_SUCCESS; unlock: if (have_bh_lock) local_unlock_nested_bh(&gcells->cells->bh_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(gro_cells_receive); /* called under BH context */ static int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; while (work_done < budget) { __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); work_done++; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) { int i; gcells->cells = alloc_percpu(struct gro_cell); if (!gcells->cells) return -ENOMEM; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); local_lock_init(&cell->bh_lock); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); struct percpu_free_defer { struct rcu_head rcu; void __percpu *ptr; }; static void percpu_free_defer_callback(struct rcu_head *head) { struct percpu_free_defer *defer; defer = container_of(head, struct percpu_free_defer, rcu); free_percpu(defer->ptr); kfree(defer); } void gro_cells_destroy(struct gro_cells *gcells) { struct percpu_free_defer *defer; int i; if (!gcells->cells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } /* We need to observe an rcu grace period before freeing ->cells, * because netpoll could access dev->napi_list under rcu protection. * Try hard using call_rcu() instead of synchronize_rcu(), * because we might be called from cleanup_net(), and we * definitely do not want to block this critical task. */ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); if (likely(defer)) { defer->ptr = gcells->cells; call_rcu(&defer->rcu, percpu_free_defer_callback); } else { /* We do not hold RTNL at this point, synchronize_net() * would not be able to expedite this sync. */ synchronize_rcu_expedited(); free_percpu(gcells->cells); } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); |
| 25 3 17 9 4 548 2 3 4 7 3 3 3 1 5 4 5 4 4 3 2 4 4 2 2 1 1 1 1 1 1 6 7 7 7 7 6 6 6 6 6 6 6 6 6 6 6 1 1 1 1 1 1 27 27 27 27 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 7 6 6 6 3 3 17 17 17 3 17 17 3 3 3 3 3 17 17 17 17 17 21 21 21 21 20 9 9 9 1 9 9 1 9 12 20 21 21 21 21 1 21 13 13 13 16 16 11 11 11 3 3 11 3 11 24 24 24 21 3 24 18 18 2 2 16 16 16 16 23 3 24 3 24 3 23 3 23 3 24 6 24 9 9 3 3 19 8 8 8 8 2 2 3 3 3 4 3 2 3 6 35 31 6 35 36 35 36 36 3 35 8 8 36 1 36 7 36 4 36 2 36 36 36 1 36 30 30 30 54 54 54 54 55 1 55 54 55 55 55 55 55 1 55 54 1 55 55 55 46 55 51 55 8 55 1 54 5 54 1 54 54 54 51 54 1 54 1 53 54 54 54 54 54 54 54 54 54 54 4 54 4 54 5 54 1 54 6 54 1 54 54 54 3 30 30 2 30 27 30 2 30 30 30 30 30 4 4 32 7 7 7 7 7 7 7 7 7 7 7 1 7 7 7 7 3 3 3 3 3 6 5 4 3 2 1 6 6 5 4 3 2 1 1 6 1 6 2 2 2 2 2 2 1 1 1 7 7 7 1 7 1 7 4 7 1 7 7 11 11 2 9 5 9 1 8 7 7 64 1 64 41 64 1 64 4 63 2 64 7 6 62 2 1 1 1 62 13 1 49 60 8 61 1 61 4 1 3 1 2 1 58 56 6 58 7 7 58 11 11 5 55 35 5 2 3 2 4 4 4 2 2 2 2 1 1 1 5 4 2 1 1 1 1 1 2 8 8 8 8 7 7 8 3 1 1 8 8 666 666 7 6 6 665 6 6 667 667 667 1 549 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 | // SPDX-License-Identifier: GPL-2.0-only /* * mac80211_hwsim - software simulator of 802.11 radio(s) for mac80211 * Copyright (c) 2008, Jouni Malinen <j@w1.fi> * Copyright (c) 2011, Javier Lopez <jlopex@gmail.com> * Copyright (c) 2016 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2025 Intel Corporation */ /* * TODO: * - Add TSF sync and fix IBSS beacon transmission by adding * competition for "air time" at TBTT * - RX filtering based on filter configuration (data->rx_filter) */ #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/platform_device.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/ktime.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/rhashtable.h> #include <linux/nospec.h> #include <linux/virtio.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include "mac80211_hwsim.h" #define WARN_QUEUE 100 #define MAX_QUEUE 200 MODULE_AUTHOR("Jouni Malinen"); MODULE_DESCRIPTION("Software simulator of 802.11 radio(s) for mac80211"); MODULE_LICENSE("GPL"); static int radios = 2; module_param(radios, int, 0444); MODULE_PARM_DESC(radios, "Number of simulated radios"); static int channels = 1; module_param(channels, int, 0444); MODULE_PARM_DESC(channels, "Number of concurrent channels"); static bool paged_rx = false; module_param(paged_rx, bool, 0644); MODULE_PARM_DESC(paged_rx, "Use paged SKBs for RX instead of linear ones"); static bool rctbl = false; module_param(rctbl, bool, 0444); MODULE_PARM_DESC(rctbl, "Handle rate control table"); static bool support_p2p_device = true; module_param(support_p2p_device, bool, 0444); MODULE_PARM_DESC(support_p2p_device, "Support P2P-Device interface type"); static bool mlo; module_param(mlo, bool, 0444); MODULE_PARM_DESC(mlo, "Support MLO"); static bool multi_radio; module_param(multi_radio, bool, 0444); MODULE_PARM_DESC(multi_radio, "Support Multiple Radios per wiphy"); /** * enum hwsim_regtest - the type of regulatory tests we offer * * @HWSIM_REGTEST_DISABLED: No regulatory tests are performed, * this is the default value. * @HWSIM_REGTEST_DRIVER_REG_FOLLOW: Used for testing the driver regulatory * hint, only one driver regulatory hint will be sent as such the * secondary radios are expected to follow. * @HWSIM_REGTEST_DRIVER_REG_ALL: Used for testing the driver regulatory * request with all radios reporting the same regulatory domain. * @HWSIM_REGTEST_DIFF_COUNTRY: Used for testing the drivers calling * different regulatory domains requests. Expected behaviour is for * an intersection to occur but each device will still use their * respective regulatory requested domains. Subsequent radios will * use the resulting intersection. * @HWSIM_REGTEST_WORLD_ROAM: Used for testing the world roaming. We accomplish * this by using a custom beacon-capable regulatory domain for the first * radio. All other device world roam. * @HWSIM_REGTEST_CUSTOM_WORLD: Used for testing the custom world regulatory * domain requests. All radios will adhere to this custom world regulatory * domain. * @HWSIM_REGTEST_CUSTOM_WORLD_2: Used for testing 2 custom world regulatory * domain requests. The first radio will adhere to the first custom world * regulatory domain, the second one to the second custom world regulatory * domain. All other devices will world roam. * @HWSIM_REGTEST_STRICT_FOLLOW: Used for testing strict regulatory domain * settings, only the first radio will send a regulatory domain request * and use strict settings. The rest of the radios are expected to follow. * @HWSIM_REGTEST_STRICT_ALL: Used for testing strict regulatory domain * settings. All radios will adhere to this. * @HWSIM_REGTEST_STRICT_AND_DRIVER_REG: Used for testing strict regulatory * domain settings, combined with secondary driver regulatory domain * settings. The first radio will get a strict regulatory domain setting * using the first driver regulatory request and the second radio will use * non-strict settings using the second driver regulatory request. All * other devices should follow the intersection created between the * first two. * @HWSIM_REGTEST_ALL: Used for testing every possible mix. You will need * at least 6 radios for a complete test. We will test in this order: * 1 - driver custom world regulatory domain * 2 - second custom world regulatory domain * 3 - first driver regulatory domain request * 4 - second driver regulatory domain request * 5 - strict regulatory domain settings using the third driver regulatory * domain request * 6 and on - should follow the intersection of the 3rd, 4rth and 5th radio * regulatory requests. * * These are the different values you can use for the regtest * module parameter. This is useful to help test world roaming * and the driver regulatory_hint() call and combinations of these. * If you want to do specific alpha2 regulatory domain tests simply * use the userspace regulatory request as that will be respected as * well without the need of this module parameter. This is designed * only for testing the driver regulatory request, world roaming * and all possible combinations. */ enum hwsim_regtest { HWSIM_REGTEST_DISABLED = 0, HWSIM_REGTEST_DRIVER_REG_FOLLOW = 1, HWSIM_REGTEST_DRIVER_REG_ALL = 2, HWSIM_REGTEST_DIFF_COUNTRY = 3, HWSIM_REGTEST_WORLD_ROAM = 4, HWSIM_REGTEST_CUSTOM_WORLD = 5, HWSIM_REGTEST_CUSTOM_WORLD_2 = 6, HWSIM_REGTEST_STRICT_FOLLOW = 7, HWSIM_REGTEST_STRICT_ALL = 8, HWSIM_REGTEST_STRICT_AND_DRIVER_REG = 9, HWSIM_REGTEST_ALL = 10, }; /* Set to one of the HWSIM_REGTEST_* values above */ static int regtest = HWSIM_REGTEST_DISABLED; module_param(regtest, int, 0444); MODULE_PARM_DESC(regtest, "The type of regulatory test we want to run"); static const char *hwsim_alpha2s[] = { "FI", "AL", "US", "DE", "JP", "AL", }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_01 = { .n_reg_rules = 5, .alpha2 = "99", .reg_rules = { REG_RULE(2412-10, 2462+10, 40, 0, 20, 0), REG_RULE(2484-10, 2484+10, 40, 0, 20, 0), REG_RULE(5150-10, 5240+10, 40, 0, 30, 0), REG_RULE(5745-10, 5825+10, 40, 0, 30, 0), REG_RULE(5855-10, 5925+10, 40, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_02 = { .n_reg_rules = 3, .alpha2 = "99", .reg_rules = { REG_RULE(2412-10, 2462+10, 40, 0, 20, 0), REG_RULE(5725-10, 5850+10, 40, 0, 30, NL80211_RRF_NO_IR), REG_RULE(5855-10, 5925+10, 40, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_03 = { .n_reg_rules = 6, .alpha2 = "99", .reg_rules = { REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0), REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0), REG_RULE(5150 - 10, 5240 + 10, 40, 0, 30, 0), REG_RULE(5745 - 10, 5825 + 10, 40, 0, 30, 0), REG_RULE(5855 - 10, 5925 + 10, 40, 0, 33, 0), REG_RULE(5955 - 10, 7125 + 10, 320, 0, 33, 0), } }; static const struct ieee80211_regdomain hwsim_world_regdom_custom_04 = { .n_reg_rules = 6, .alpha2 = "99", .reg_rules = { REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0), REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0), REG_RULE(5150 - 10, 5240 + 10, 80, 0, 30, NL80211_RRF_AUTO_BW), REG_RULE(5260 - 10, 5320 + 10, 80, 0, 30, NL80211_RRF_DFS_CONCURRENT | NL80211_RRF_DFS | NL80211_RRF_AUTO_BW), REG_RULE(5500 - 10, 5720 + 10, 160, 0, 30, NL80211_RRF_DFS_CONCURRENT | NL80211_RRF_DFS), REG_RULE(5745 - 10, 5825 + 10, 80, 0, 30, 0), REG_RULE(5855 - 10, 5925 + 10, 80, 0, 33, 0), } }; static const struct ieee80211_regdomain *hwsim_world_regdom_custom[] = { &hwsim_world_regdom_custom_01, &hwsim_world_regdom_custom_02, &hwsim_world_regdom_custom_03, &hwsim_world_regdom_custom_04, }; struct hwsim_vif_priv { u32 magic; u32 skip_beacons[IEEE80211_MLD_MAX_NUM_LINKS]; u8 bssid[ETH_ALEN]; bool assoc; bool bcn_en; u16 aid; }; #define HWSIM_VIF_MAGIC 0x69537748 static inline void hwsim_check_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; WARN(vp->magic != HWSIM_VIF_MAGIC, "Invalid VIF (%p) magic %#x, %pM, %d/%d\n", vif, vp->magic, vif->addr, vif->type, vif->p2p); } static inline void hwsim_set_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; vp->magic = HWSIM_VIF_MAGIC; } static inline void hwsim_clear_magic(struct ieee80211_vif *vif) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; vp->magic = 0; } struct hwsim_sta_priv { u32 magic; unsigned int last_link; u16 active_links_rx; }; #define HWSIM_STA_MAGIC 0x6d537749 static inline void hwsim_check_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; WARN_ON(sp->magic != HWSIM_STA_MAGIC); } static inline void hwsim_set_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; sp->magic = HWSIM_STA_MAGIC; } static inline void hwsim_clear_sta_magic(struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; sp->magic = 0; } struct hwsim_chanctx_priv { u32 magic; }; #define HWSIM_CHANCTX_MAGIC 0x6d53774a static inline void hwsim_check_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; WARN_ON(cp->magic != HWSIM_CHANCTX_MAGIC); } static inline void hwsim_set_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; cp->magic = HWSIM_CHANCTX_MAGIC; } static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) { struct hwsim_chanctx_priv *cp = (void *)c->drv_priv; cp->magic = 0; } static unsigned int hwsim_net_id; static DEFINE_IDA(hwsim_netgroup_ida); struct hwsim_net { int netgroup; u32 wmediumd; }; static inline int hwsim_net_get_netgroup(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); return hwsim_net->netgroup; } static inline int hwsim_net_set_netgroup(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); hwsim_net->netgroup = ida_alloc(&hwsim_netgroup_ida, GFP_KERNEL); return hwsim_net->netgroup >= 0 ? 0 : -ENOMEM; } static inline u32 hwsim_net_get_wmediumd(struct net *net) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); return hwsim_net->wmediumd; } static inline void hwsim_net_set_wmediumd(struct net *net, u32 portid) { struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id); hwsim_net->wmediumd = portid; } static struct class *hwsim_class; static struct net_device *hwsim_mon; /* global monitor netdev */ #define CHAN2G(_freq) { \ .band = NL80211_BAND_2GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } #define CHAN5G(_freq) { \ .band = NL80211_BAND_5GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } #define CHAN6G(_freq) { \ .band = NL80211_BAND_6GHZ, \ .center_freq = (_freq), \ .hw_value = (_freq), \ } static const struct ieee80211_channel hwsim_channels_2ghz[] = { CHAN2G(2412), /* Channel 1 */ CHAN2G(2417), /* Channel 2 */ CHAN2G(2422), /* Channel 3 */ CHAN2G(2427), /* Channel 4 */ CHAN2G(2432), /* Channel 5 */ CHAN2G(2437), /* Channel 6 */ CHAN2G(2442), /* Channel 7 */ CHAN2G(2447), /* Channel 8 */ CHAN2G(2452), /* Channel 9 */ CHAN2G(2457), /* Channel 10 */ CHAN2G(2462), /* Channel 11 */ CHAN2G(2467), /* Channel 12 */ CHAN2G(2472), /* Channel 13 */ CHAN2G(2484), /* Channel 14 */ }; static const struct ieee80211_channel hwsim_channels_5ghz[] = { CHAN5G(5180), /* Channel 36 */ CHAN5G(5200), /* Channel 40 */ CHAN5G(5220), /* Channel 44 */ CHAN5G(5240), /* Channel 48 */ CHAN5G(5260), /* Channel 52 */ CHAN5G(5280), /* Channel 56 */ CHAN5G(5300), /* Channel 60 */ CHAN5G(5320), /* Channel 64 */ CHAN5G(5500), /* Channel 100 */ CHAN5G(5520), /* Channel 104 */ CHAN5G(5540), /* Channel 108 */ CHAN5G(5560), /* Channel 112 */ CHAN5G(5580), /* Channel 116 */ CHAN5G(5600), /* Channel 120 */ CHAN5G(5620), /* Channel 124 */ CHAN5G(5640), /* Channel 128 */ CHAN5G(5660), /* Channel 132 */ CHAN5G(5680), /* Channel 136 */ CHAN5G(5700), /* Channel 140 */ CHAN5G(5745), /* Channel 149 */ CHAN5G(5765), /* Channel 153 */ CHAN5G(5785), /* Channel 157 */ CHAN5G(5805), /* Channel 161 */ CHAN5G(5825), /* Channel 165 */ CHAN5G(5845), /* Channel 169 */ CHAN5G(5855), /* Channel 171 */ CHAN5G(5860), /* Channel 172 */ CHAN5G(5865), /* Channel 173 */ CHAN5G(5870), /* Channel 174 */ CHAN5G(5875), /* Channel 175 */ CHAN5G(5880), /* Channel 176 */ CHAN5G(5885), /* Channel 177 */ CHAN5G(5890), /* Channel 178 */ CHAN5G(5895), /* Channel 179 */ CHAN5G(5900), /* Channel 180 */ CHAN5G(5905), /* Channel 181 */ CHAN5G(5910), /* Channel 182 */ CHAN5G(5915), /* Channel 183 */ CHAN5G(5920), /* Channel 184 */ CHAN5G(5925), /* Channel 185 */ }; static const struct ieee80211_channel hwsim_channels_6ghz[] = { CHAN6G(5955), /* Channel 1 */ CHAN6G(5975), /* Channel 5 */ CHAN6G(5995), /* Channel 9 */ CHAN6G(6015), /* Channel 13 */ CHAN6G(6035), /* Channel 17 */ CHAN6G(6055), /* Channel 21 */ CHAN6G(6075), /* Channel 25 */ CHAN6G(6095), /* Channel 29 */ CHAN6G(6115), /* Channel 33 */ CHAN6G(6135), /* Channel 37 */ CHAN6G(6155), /* Channel 41 */ CHAN6G(6175), /* Channel 45 */ CHAN6G(6195), /* Channel 49 */ CHAN6G(6215), /* Channel 53 */ CHAN6G(6235), /* Channel 57 */ CHAN6G(6255), /* Channel 61 */ CHAN6G(6275), /* Channel 65 */ CHAN6G(6295), /* Channel 69 */ CHAN6G(6315), /* Channel 73 */ CHAN6G(6335), /* Channel 77 */ CHAN6G(6355), /* Channel 81 */ CHAN6G(6375), /* Channel 85 */ CHAN6G(6395), /* Channel 89 */ CHAN6G(6415), /* Channel 93 */ CHAN6G(6435), /* Channel 97 */ CHAN6G(6455), /* Channel 181 */ CHAN6G(6475), /* Channel 105 */ CHAN6G(6495), /* Channel 109 */ CHAN6G(6515), /* Channel 113 */ CHAN6G(6535), /* Channel 117 */ CHAN6G(6555), /* Channel 121 */ CHAN6G(6575), /* Channel 125 */ CHAN6G(6595), /* Channel 129 */ CHAN6G(6615), /* Channel 133 */ CHAN6G(6635), /* Channel 137 */ CHAN6G(6655), /* Channel 141 */ CHAN6G(6675), /* Channel 145 */ CHAN6G(6695), /* Channel 149 */ CHAN6G(6715), /* Channel 153 */ CHAN6G(6735), /* Channel 157 */ CHAN6G(6755), /* Channel 161 */ CHAN6G(6775), /* Channel 165 */ CHAN6G(6795), /* Channel 169 */ CHAN6G(6815), /* Channel 173 */ CHAN6G(6835), /* Channel 177 */ CHAN6G(6855), /* Channel 181 */ CHAN6G(6875), /* Channel 185 */ CHAN6G(6895), /* Channel 189 */ CHAN6G(6915), /* Channel 193 */ CHAN6G(6935), /* Channel 197 */ CHAN6G(6955), /* Channel 201 */ CHAN6G(6975), /* Channel 205 */ CHAN6G(6995), /* Channel 209 */ CHAN6G(7015), /* Channel 213 */ CHAN6G(7035), /* Channel 217 */ CHAN6G(7055), /* Channel 221 */ CHAN6G(7075), /* Channel 225 */ CHAN6G(7095), /* Channel 229 */ CHAN6G(7115), /* Channel 233 */ }; #define NUM_S1G_CHANS_US 51 static struct ieee80211_channel hwsim_channels_s1g[NUM_S1G_CHANS_US]; static const struct ieee80211_sta_s1g_cap hwsim_s1g_cap = { .s1g = true, .cap = { S1G_CAP0_SGI_1MHZ | S1G_CAP0_SGI_2MHZ, 0, 0, S1G_CAP3_MAX_MPDU_LEN, 0, S1G_CAP5_AMPDU, 0, S1G_CAP7_DUP_1MHZ, S1G_CAP8_TWT_RESPOND | S1G_CAP8_TWT_REQUEST, 0}, .nss_mcs = { 0xfc | 1, /* MCS 7 for 1 SS */ /* RX Highest Supported Long GI Data Rate 0:7 */ 0, /* RX Highest Supported Long GI Data Rate 0:7 */ /* TX S1G MCS Map 0:6 */ 0xfa, /* TX S1G MCS Map :7 */ /* TX Highest Supported Long GI Data Rate 0:6 */ 0x80, /* TX Highest Supported Long GI Data Rate 7:8 */ /* Rx Single spatial stream and S1G-MCS Map for 1MHz */ /* Tx Single spatial stream and S1G-MCS Map for 1MHz */ 0 }, }; static void hwsim_init_s1g_channels(struct ieee80211_channel *chans) { int ch, freq; for (ch = 0; ch < NUM_S1G_CHANS_US; ch++) { freq = 902000 + (ch + 1) * 500; chans[ch].band = NL80211_BAND_S1GHZ; chans[ch].center_freq = KHZ_TO_MHZ(freq); chans[ch].freq_offset = freq % 1000; chans[ch].hw_value = ch + 1; } } static const struct ieee80211_rate hwsim_rates[] = { { .bitrate = 10 }, { .bitrate = 20, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 55, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 110, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, { .bitrate = 60 }, { .bitrate = 90 }, { .bitrate = 120 }, { .bitrate = 180 }, { .bitrate = 240 }, { .bitrate = 360 }, { .bitrate = 480 }, { .bitrate = 540 } }; #define DEFAULT_RX_RSSI -50 static const u32 hwsim_ciphers[] = { WLAN_CIPHER_SUITE_WEP40, WLAN_CIPHER_SUITE_WEP104, WLAN_CIPHER_SUITE_TKIP, WLAN_CIPHER_SUITE_CCMP, WLAN_CIPHER_SUITE_CCMP_256, WLAN_CIPHER_SUITE_GCMP, WLAN_CIPHER_SUITE_GCMP_256, WLAN_CIPHER_SUITE_AES_CMAC, WLAN_CIPHER_SUITE_BIP_CMAC_256, WLAN_CIPHER_SUITE_BIP_GMAC_128, WLAN_CIPHER_SUITE_BIP_GMAC_256, }; #define OUI_QCA 0x001374 #define QCA_NL80211_SUBCMD_TEST 1 enum qca_nl80211_vendor_subcmds { QCA_WLAN_VENDOR_ATTR_TEST = 8, QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_TEST }; static const struct nla_policy hwsim_vendor_test_policy[QCA_WLAN_VENDOR_ATTR_MAX + 1] = { [QCA_WLAN_VENDOR_ATTR_MAX] = { .type = NLA_U32 }, }; static int mac80211_hwsim_vendor_cmd_test(struct wiphy *wiphy, struct wireless_dev *wdev, const void *data, int data_len) { struct sk_buff *skb; struct nlattr *tb[QCA_WLAN_VENDOR_ATTR_MAX + 1]; int err; u32 val; err = nla_parse_deprecated(tb, QCA_WLAN_VENDOR_ATTR_MAX, data, data_len, hwsim_vendor_test_policy, NULL); if (err) return err; if (!tb[QCA_WLAN_VENDOR_ATTR_TEST]) return -EINVAL; val = nla_get_u32(tb[QCA_WLAN_VENDOR_ATTR_TEST]); wiphy_dbg(wiphy, "%s: test=%u\n", __func__, val); /* Send a vendor event as a test. Note that this would not normally be * done within a command handler, but rather, based on some other * trigger. For simplicity, this command is used to trigger the event * here. * * event_idx = 0 (index in mac80211_hwsim_vendor_commands) */ skb = cfg80211_vendor_event_alloc(wiphy, wdev, 100, 0, GFP_KERNEL); if (skb) { /* skb_put() or nla_put() will fill up data within * NL80211_ATTR_VENDOR_DATA. */ /* Add vendor data */ nla_put_u32(skb, QCA_WLAN_VENDOR_ATTR_TEST, val + 1); /* Send the event - this will call nla_nest_end() */ cfg80211_vendor_event(skb, GFP_KERNEL); } /* Send a response to the command */ skb = cfg80211_vendor_cmd_alloc_reply_skb(wiphy, 10); if (!skb) return -ENOMEM; /* skb_put() or nla_put() will fill up data within * NL80211_ATTR_VENDOR_DATA */ nla_put_u32(skb, QCA_WLAN_VENDOR_ATTR_TEST, val + 2); return cfg80211_vendor_cmd_reply(skb); } static struct wiphy_vendor_command mac80211_hwsim_vendor_commands[] = { { .info = { .vendor_id = OUI_QCA, .subcmd = QCA_NL80211_SUBCMD_TEST }, .flags = WIPHY_VENDOR_CMD_NEED_NETDEV, .doit = mac80211_hwsim_vendor_cmd_test, .policy = hwsim_vendor_test_policy, .maxattr = QCA_WLAN_VENDOR_ATTR_MAX, } }; /* Advertise support vendor specific events */ static const struct nl80211_vendor_cmd_info mac80211_hwsim_vendor_events[] = { { .vendor_id = OUI_QCA, .subcmd = 1 }, }; static DEFINE_SPINLOCK(hwsim_radio_lock); static LIST_HEAD(hwsim_radios); static struct rhashtable hwsim_radios_rht; static int hwsim_radio_idx; static int hwsim_radios_generation = 1; static u8 hwsim_nan_cluster_id[ETH_ALEN]; static struct platform_driver mac80211_hwsim_driver = { .driver = { .name = "mac80211_hwsim", }, }; struct mac80211_hwsim_link_data { u32 link_id; u64 beacon_int /* beacon interval in us */; struct hrtimer beacon_timer; }; struct mac80211_hwsim_data { struct list_head list; struct rhash_head rht; struct ieee80211_hw *hw; struct device *dev; struct ieee80211_supported_band bands[NUM_NL80211_BANDS]; struct ieee80211_channel channels_2ghz[ARRAY_SIZE(hwsim_channels_2ghz)]; struct ieee80211_channel channels_5ghz[ARRAY_SIZE(hwsim_channels_5ghz)]; struct ieee80211_channel channels_6ghz[ARRAY_SIZE(hwsim_channels_6ghz)]; struct ieee80211_channel channels_s1g[ARRAY_SIZE(hwsim_channels_s1g)]; struct ieee80211_rate rates[ARRAY_SIZE(hwsim_rates)]; struct ieee80211_iface_combination if_combination; struct ieee80211_iface_limit if_limits[4]; int n_if_limits; struct ieee80211_iface_combination if_combination_radio; struct wiphy_radio_freq_range radio_range[NUM_NL80211_BANDS]; struct wiphy_radio radio[NUM_NL80211_BANDS]; u32 ciphers[ARRAY_SIZE(hwsim_ciphers)]; struct mac_address addresses[3]; int channels, idx; bool use_chanctx; bool destroy_on_close; u32 portid; char alpha2[2]; const struct ieee80211_regdomain *regd; struct ieee80211_channel *tmp_chan; struct ieee80211_channel *roc_chan; u32 roc_duration; struct delayed_work roc_start; struct delayed_work roc_done; struct delayed_work hw_scan; struct cfg80211_scan_request *hw_scan_request; struct ieee80211_vif *hw_scan_vif; int scan_chan_idx; u8 scan_addr[ETH_ALEN]; struct { struct ieee80211_channel *channel; unsigned long next_start, start, end; } survey_data[ARRAY_SIZE(hwsim_channels_2ghz) + ARRAY_SIZE(hwsim_channels_5ghz) + ARRAY_SIZE(hwsim_channels_6ghz)]; struct ieee80211_channel *channel; enum nl80211_chan_width bw; unsigned int rx_filter; bool started, idle, scanning; struct mutex mutex; enum ps_mode { PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL } ps; bool ps_poll_pending; struct dentry *debugfs; atomic_t pending_cookie; struct sk_buff_head pending; /* packets pending */ /* * Only radios in the same group can communicate together (the * channel has to match too). Each bit represents a group. A * radio can be in more than one group. */ u64 group; /* group shared by radios created in the same netns */ int netgroup; /* wmediumd portid responsible for netgroup of this radio */ u32 wmediumd; /* difference between this hw's clock and the real clock, in usecs */ s64 tsf_offset; s64 bcn_delta; /* absolute beacon transmission time. Used to cover up "tx" delay. */ u64 abs_bcn_ts; /* Stats */ u64 tx_pkts; u64 rx_pkts; u64 tx_bytes; u64 rx_bytes; u64 tx_dropped; u64 tx_failed; /* RSSI in rx status of the receiver */ int rx_rssi; /* only used when pmsr capability is supplied */ struct cfg80211_pmsr_capabilities pmsr_capa; struct cfg80211_pmsr_request *pmsr_request; struct wireless_dev *pmsr_request_wdev; struct mac80211_hwsim_link_data link_data[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_vif *nan_device_vif; u8 nan_bands; enum nl80211_band nan_curr_dw_band; struct hrtimer nan_timer; bool notify_dw; struct ieee80211_vif *nan_vif; }; static const struct rhashtable_params hwsim_rht_params = { .nelem_hint = 2, .automatic_shrinking = true, .key_len = ETH_ALEN, .key_offset = offsetof(struct mac80211_hwsim_data, addresses[1]), .head_offset = offsetof(struct mac80211_hwsim_data, rht), }; struct hwsim_radiotap_hdr { struct ieee80211_radiotap_header_fixed hdr; __le64 rt_tsft; u8 rt_flags; u8 rt_rate; __le16 rt_channel; __le16 rt_chbitmask; } __packed; struct hwsim_radiotap_ack_hdr { struct ieee80211_radiotap_header_fixed hdr; u8 rt_flags; u8 pad; __le16 rt_channel; __le16 rt_chbitmask; } __packed; static struct mac80211_hwsim_data *get_hwsim_data_ref_from_addr(const u8 *addr) { return rhashtable_lookup_fast(&hwsim_radios_rht, addr, hwsim_rht_params); } /* MAC80211_HWSIM netlink family */ static struct genl_family hwsim_genl_family; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, }; static const struct genl_multicast_group hwsim_mcgrps[] = { [HWSIM_MCGRP_CONFIG] = { .name = "config", }, }; /* MAC80211_HWSIM netlink policy */ static const struct nla_policy hwsim_rate_info_policy[HWSIM_RATE_INFO_ATTR_MAX + 1] = { [HWSIM_RATE_INFO_ATTR_FLAGS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_MCS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_LEGACY] = { .type = NLA_U16 }, [HWSIM_RATE_INFO_ATTR_NSS] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_BW] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_GI] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_DCM] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_EHT_GI] = { .type = NLA_U8 }, [HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC] = { .type = NLA_U8 }, }; static const struct nla_policy hwsim_ftm_result_policy[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX] = { .type = NLA_U16 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_RESP_ATTR_TX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy), [NL80211_PMSR_FTM_RESP_ATTR_RX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy), [NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD] = { .type = NLA_U64 }, [NL80211_PMSR_FTM_RESP_ATTR_LCI] = { .type = NLA_STRING }, [NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_STRING }, }; static const struct nla_policy hwsim_pmsr_resp_type_policy[NL80211_PMSR_TYPE_MAX + 1] = { [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_result_policy), }; static const struct nla_policy hwsim_pmsr_resp_policy[NL80211_PMSR_RESP_ATTR_MAX + 1] = { [NL80211_PMSR_RESP_ATTR_STATUS] = { .type = NLA_U32 }, [NL80211_PMSR_RESP_ATTR_HOST_TIME] = { .type = NLA_U64 }, [NL80211_PMSR_RESP_ATTR_AP_TSF] = { .type = NLA_U64 }, [NL80211_PMSR_RESP_ATTR_FINAL] = { .type = NLA_FLAG }, [NL80211_PMSR_RESP_ATTR_DATA] = NLA_POLICY_NESTED(hwsim_pmsr_resp_type_policy), }; static const struct nla_policy hwsim_pmsr_peer_result_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT, [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_REJECT }, [NL80211_PMSR_PEER_ATTR_REQ] = { .type = NLA_REJECT }, [NL80211_PMSR_PEER_ATTR_RESP] = NLA_POLICY_NESTED(hwsim_pmsr_resp_policy), }; static const struct nla_policy hwsim_pmsr_peers_result_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_PEERS] = NLA_POLICY_NESTED_ARRAY(hwsim_pmsr_peer_result_policy), }; static const struct nla_policy hwsim_ftm_capa_policy[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_CAPA_ATTR_ASAP] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS] = { .type = NLA_U32 }, [NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT] = NLA_POLICY_MAX(NLA_U8, 15), [NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST] = NLA_POLICY_MAX(NLA_U8, 31), [NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, }; static const struct nla_policy hwsim_pmsr_capa_type_policy[NL80211_PMSR_TYPE_MAX + 1] = { [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_capa_policy), }; static const struct nla_policy hwsim_pmsr_capa_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_U32 }, [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_FLAG }, [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_FLAG }, [NL80211_PMSR_ATTR_TYPE_CAPA] = NLA_POLICY_NESTED(hwsim_pmsr_capa_type_policy), [NL80211_PMSR_ATTR_PEERS] = { .type = NLA_REJECT }, // only for request. }; static const struct nla_policy hwsim_genl_policy[HWSIM_ATTR_MAX + 1] = { [HWSIM_ATTR_ADDR_RECEIVER] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_ADDR_TRANSMITTER] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_FRAME] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [HWSIM_ATTR_FLAGS] = { .type = NLA_U32 }, [HWSIM_ATTR_RX_RATE] = { .type = NLA_U32 }, [HWSIM_ATTR_SIGNAL] = { .type = NLA_U32 }, [HWSIM_ATTR_TX_INFO] = { .type = NLA_BINARY, .len = IEEE80211_TX_MAX_RATES * sizeof(struct hwsim_tx_rate)}, [HWSIM_ATTR_COOKIE] = { .type = NLA_U64 }, [HWSIM_ATTR_CHANNELS] = { .type = NLA_U32 }, [HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 }, [HWSIM_ATTR_REG_HINT_ALPHA2] = { .type = NLA_STRING, .len = 2 }, [HWSIM_ATTR_REG_CUSTOM_REG] = { .type = NLA_U32 }, [HWSIM_ATTR_REG_STRICT_REG] = { .type = NLA_FLAG }, [HWSIM_ATTR_SUPPORT_P2P_DEVICE] = { .type = NLA_FLAG }, [HWSIM_ATTR_USE_CHANCTX] = { .type = NLA_FLAG }, [HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE] = { .type = NLA_FLAG }, [HWSIM_ATTR_RADIO_NAME] = { .type = NLA_STRING }, [HWSIM_ATTR_NO_VIF] = { .type = NLA_FLAG }, [HWSIM_ATTR_FREQ] = { .type = NLA_U32 }, [HWSIM_ATTR_TX_INFO_FLAGS] = { .type = NLA_BINARY }, [HWSIM_ATTR_PERM_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT, [HWSIM_ATTR_IFTYPE_SUPPORT] = { .type = NLA_U32 }, [HWSIM_ATTR_CIPHER_SUPPORT] = { .type = NLA_BINARY }, [HWSIM_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [HWSIM_ATTR_PMSR_SUPPORT] = NLA_POLICY_NESTED(hwsim_pmsr_capa_policy), [HWSIM_ATTR_PMSR_RESULT] = NLA_POLICY_NESTED(hwsim_pmsr_peers_result_policy), [HWSIM_ATTR_MULTI_RADIO] = { .type = NLA_FLAG }, [HWSIM_ATTR_SUPPORT_NAN_DEVICE] = { .type = NLA_FLAG }, }; #if IS_REACHABLE(CONFIG_VIRTIO) /* MAC80211_HWSIM virtio queues */ static struct virtqueue *hwsim_vqs[HWSIM_NUM_VQS]; static bool hwsim_virtio_enabled; static DEFINE_SPINLOCK(hwsim_virtio_lock); static void hwsim_virtio_rx_work(struct work_struct *work); static DECLARE_WORK(hwsim_virtio_rx, hwsim_virtio_rx_work); static int hwsim_tx_virtio(struct mac80211_hwsim_data *data, struct sk_buff *skb) { struct scatterlist sg[1]; unsigned long flags; int err; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) { err = -ENODEV; goto out_free; } sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_outbuf(hwsim_vqs[HWSIM_VQ_TX], sg, 1, skb, GFP_ATOMIC); if (err) goto out_free; virtqueue_kick(hwsim_vqs[HWSIM_VQ_TX]); spin_unlock_irqrestore(&hwsim_virtio_lock, flags); return 0; out_free: spin_unlock_irqrestore(&hwsim_virtio_lock, flags); nlmsg_free(skb); return err; } #else /* cause a linker error if this ends up being needed */ extern int hwsim_tx_virtio(struct mac80211_hwsim_data *data, struct sk_buff *skb); #define hwsim_virtio_enabled false #endif static int hwsim_get_chanwidth(enum nl80211_chan_width bw) { switch (bw) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return 20; case NL80211_CHAN_WIDTH_40: return 40; case NL80211_CHAN_WIDTH_80: return 80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return 160; case NL80211_CHAN_WIDTH_320: return 320; case NL80211_CHAN_WIDTH_5: return 5; case NL80211_CHAN_WIDTH_10: return 10; case NL80211_CHAN_WIDTH_1: return 1; case NL80211_CHAN_WIDTH_2: return 2; case NL80211_CHAN_WIDTH_4: return 4; case NL80211_CHAN_WIDTH_8: return 8; case NL80211_CHAN_WIDTH_16: return 16; } return INT_MAX; } static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan); /* sysfs attributes */ static void hwsim_send_ps_poll(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct sk_buff *skb; struct ieee80211_pspoll *pspoll; if (!vp->assoc) return; wiphy_dbg(data->hw->wiphy, "%s: send PS-Poll to %pM for aid %d\n", __func__, vp->bssid, vp->aid); skb = dev_alloc_skb(sizeof(*pspoll)); if (!skb) return; pspoll = skb_put(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM); pspoll->aid = cpu_to_le16(0xc000 | vp->aid); memcpy(pspoll->bssid, vp->bssid, ETH_ALEN); memcpy(pspoll->ta, mac, ETH_ALEN); rcu_read_lock(); mac80211_hwsim_tx_frame(data->hw, skb, rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan); rcu_read_unlock(); } static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac, struct ieee80211_vif *vif, int ps) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct sk_buff *skb; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *cb; if (!vp->assoc) return; wiphy_dbg(data->hw->wiphy, "%s: send data::nullfunc to %pM ps=%d\n", __func__, vp->bssid, ps); skb = dev_alloc_skb(sizeof(*hdr)); if (!skb) return; hdr = skb_put(skb, sizeof(*hdr) - ETH_ALEN); hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS | (ps ? IEEE80211_FCTL_PM : 0)); hdr->duration_id = cpu_to_le16(0); memcpy(hdr->addr1, vp->bssid, ETH_ALEN); memcpy(hdr->addr2, mac, ETH_ALEN); memcpy(hdr->addr3, vp->bssid, ETH_ALEN); cb = IEEE80211_SKB_CB(skb); cb->control.rates[0].count = 1; cb->control.rates[1].idx = -1; rcu_read_lock(); mac80211_hwsim_tx_frame(data->hw, skb, rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan); rcu_read_unlock(); } static void hwsim_send_nullfunc_ps(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; hwsim_send_nullfunc(data, mac, vif, 1); } static void hwsim_send_nullfunc_no_ps(void *dat, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = dat; hwsim_send_nullfunc(data, mac, vif, 0); } static int hwsim_fops_ps_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->ps; return 0; } static int hwsim_fops_ps_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; enum ps_mode old_ps; if (val != PS_DISABLED && val != PS_ENABLED && val != PS_AUTO_POLL && val != PS_MANUAL_POLL) return -EINVAL; if (val == PS_MANUAL_POLL) { if (data->ps != PS_ENABLED) return -EINVAL; local_bh_disable(); ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_ps_poll, data); local_bh_enable(); return 0; } old_ps = data->ps; data->ps = val; local_bh_disable(); if (old_ps == PS_DISABLED && val != PS_DISABLED) { ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_nullfunc_ps, data); } else if (old_ps != PS_DISABLED && val == PS_DISABLED) { ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, hwsim_send_nullfunc_no_ps, data); } local_bh_enable(); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_ps, hwsim_fops_ps_read, hwsim_fops_ps_write, "%llu\n"); static int hwsim_write_simulate_radar(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; ieee80211_radar_detected(data->hw, NULL); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_simulate_radar, NULL, hwsim_write_simulate_radar, "%llu\n"); static int hwsim_fops_group_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->group; return 0; } static int hwsim_fops_group_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; data->group = val; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_group, hwsim_fops_group_read, hwsim_fops_group_write, "%llx\n"); static int hwsim_fops_rx_rssi_read(void *dat, u64 *val) { struct mac80211_hwsim_data *data = dat; *val = data->rx_rssi; return 0; } static int hwsim_fops_rx_rssi_write(void *dat, u64 val) { struct mac80211_hwsim_data *data = dat; int rssi = (int)val; if (rssi >= 0 || rssi < -100) return -EINVAL; data->rx_rssi = rssi; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_rx_rssi, hwsim_fops_rx_rssi_read, hwsim_fops_rx_rssi_write, "%lld\n"); static netdev_tx_t hwsim_mon_xmit(struct sk_buff *skb, struct net_device *dev) { /* TODO: allow packet injection */ dev_kfree_skb(skb); return NETDEV_TX_OK; } static inline u64 mac80211_hwsim_get_tsf_raw(void) { return ktime_to_us(ktime_get_real()); } static __le64 __mac80211_hwsim_get_tsf(struct mac80211_hwsim_data *data) { u64 now = mac80211_hwsim_get_tsf_raw(); return cpu_to_le64(now + data->tsf_offset); } static u64 mac80211_hwsim_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = hw->priv; return le64_to_cpu(__mac80211_hwsim_get_tsf(data)); } static void mac80211_hwsim_set_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf) { struct mac80211_hwsim_data *data = hw->priv; u64 now = mac80211_hwsim_get_tsf(hw, vif); /* MLD not supported here */ u32 bcn_int = data->link_data[0].beacon_int; u64 delta = abs(tsf - now); struct ieee80211_bss_conf *conf; conf = link_conf_dereference_protected(vif, data->link_data[0].link_id); if (conf && !conf->enable_beacon) return; /* adjust after beaconing with new timestamp at old TBTT */ if (tsf > now) { data->tsf_offset += delta; data->bcn_delta = do_div(delta, bcn_int); } else { data->tsf_offset -= delta; data->bcn_delta = -(s64)do_div(delta, bcn_int); } } static void mac80211_hwsim_monitor_rx(struct ieee80211_hw *hw, struct sk_buff *tx_skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; struct sk_buff *skb; struct hwsim_radiotap_hdr *hdr; u16 flags, bitrate; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_skb); struct ieee80211_rate *txrate = ieee80211_get_tx_rate(hw, info); if (!txrate) bitrate = 0; else bitrate = txrate->bitrate; if (!netif_running(hwsim_mon)) return; skb = skb_copy_expand(tx_skb, sizeof(*hdr), 0, GFP_ATOMIC); if (skb == NULL) return; hdr = skb_push(skb, sizeof(*hdr)); hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION; hdr->hdr.it_pad = 0; hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr)); hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | (1 << IEEE80211_RADIOTAP_RATE) | (1 << IEEE80211_RADIOTAP_TSFT) | (1 << IEEE80211_RADIOTAP_CHANNEL)); hdr->rt_tsft = __mac80211_hwsim_get_tsf(data); hdr->rt_flags = 0; hdr->rt_rate = bitrate / 5; hdr->rt_channel = cpu_to_le16(chan->center_freq); flags = IEEE80211_CHAN_2GHZ; if (txrate && txrate->flags & IEEE80211_RATE_ERP_G) flags |= IEEE80211_CHAN_OFDM; else flags |= IEEE80211_CHAN_CCK; hdr->rt_chbitmask = cpu_to_le16(flags); skb->dev = hwsim_mon; skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } static void mac80211_hwsim_monitor_ack(struct ieee80211_channel *chan, const u8 *addr) { struct sk_buff *skb; struct hwsim_radiotap_ack_hdr *hdr; u16 flags; struct ieee80211_hdr *hdr11; if (!netif_running(hwsim_mon)) return; skb = dev_alloc_skb(100); if (skb == NULL) return; hdr = skb_put(skb, sizeof(*hdr)); hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION; hdr->hdr.it_pad = 0; hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr)); hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | (1 << IEEE80211_RADIOTAP_CHANNEL)); hdr->rt_flags = 0; hdr->pad = 0; hdr->rt_channel = cpu_to_le16(chan->center_freq); flags = IEEE80211_CHAN_2GHZ; hdr->rt_chbitmask = cpu_to_le16(flags); hdr11 = skb_put(skb, 10); hdr11->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_ACK); hdr11->duration_id = cpu_to_le16(0); memcpy(hdr11->addr1, addr, ETH_ALEN); skb->dev = hwsim_mon; skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } struct mac80211_hwsim_addr_match_data { u8 addr[ETH_ALEN]; bool ret; }; static void mac80211_hwsim_addr_iter(void *data, u8 *mac, struct ieee80211_vif *vif) { int i; struct mac80211_hwsim_addr_match_data *md = data; if (memcmp(mac, md->addr, ETH_ALEN) == 0) { md->ret = true; return; } /* Match the link address */ for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(vif->link_conf[i]); if (!conf) continue; if (memcmp(conf->addr, md->addr, ETH_ALEN) == 0) { md->ret = true; return; } } } static bool mac80211_hwsim_addr_match(struct mac80211_hwsim_data *data, const u8 *addr) { struct mac80211_hwsim_addr_match_data md = { .ret = false, }; if (data->scanning && memcmp(addr, data->scan_addr, ETH_ALEN) == 0) return true; memcpy(md.addr, addr, ETH_ALEN); ieee80211_iterate_active_interfaces_atomic(data->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_addr_iter, &md); return md.ret; } static bool hwsim_ps_rx_ok(struct mac80211_hwsim_data *data, struct sk_buff *skb) { switch (data->ps) { case PS_DISABLED: return true; case PS_ENABLED: return false; case PS_AUTO_POLL: /* TODO: accept (some) Beacons by default and other frames only * if pending PS-Poll has been sent */ return true; case PS_MANUAL_POLL: /* Allow unicast frames to own address if there is a pending * PS-Poll */ if (data->ps_poll_pending && mac80211_hwsim_addr_match(data, skb->data + 4)) { data->ps_poll_pending = false; return true; } return false; } return true; } static int hwsim_unicast_netgroup(struct mac80211_hwsim_data *data, struct sk_buff *skb, int portid) { struct net *net; bool found = false; int res = -ENOENT; rcu_read_lock(); for_each_net_rcu(net) { if (data->netgroup == hwsim_net_get_netgroup(net)) { res = genlmsg_unicast(net, skb, portid); found = true; break; } } rcu_read_unlock(); if (!found) nlmsg_free(skb); return res; } static void mac80211_hwsim_config_mac_nl(struct ieee80211_hw *hw, const u8 *addr, bool add) { struct mac80211_hwsim_data *data = hw->priv; u32 _portid = READ_ONCE(data->wmediumd); struct sk_buff *skb; void *msg_head; WARN_ON(!is_valid_ether_addr(addr)); if (!_portid && !hwsim_virtio_enabled) return; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, add ? HWSIM_CMD_ADD_MAC_ADDR : HWSIM_CMD_DEL_MAC_ADDR); if (!msg_head) { pr_debug("mac80211_hwsim: problem with msg_head\n"); goto nla_put_failure; } if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto nla_put_failure; if (nla_put(skb, HWSIM_ATTR_ADDR_RECEIVER, ETH_ALEN, addr)) goto nla_put_failure; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); return; nla_put_failure: nlmsg_free(skb); } static inline u16 trans_tx_rate_flags_ieee2hwsim(struct ieee80211_tx_rate *rate) { u16 result = 0; if (rate->flags & IEEE80211_TX_RC_USE_RTS_CTS) result |= MAC80211_HWSIM_TX_RC_USE_RTS_CTS; if (rate->flags & IEEE80211_TX_RC_USE_CTS_PROTECT) result |= MAC80211_HWSIM_TX_RC_USE_CTS_PROTECT; if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) result |= MAC80211_HWSIM_TX_RC_USE_SHORT_PREAMBLE; if (rate->flags & IEEE80211_TX_RC_MCS) result |= MAC80211_HWSIM_TX_RC_MCS; if (rate->flags & IEEE80211_TX_RC_GREEN_FIELD) result |= MAC80211_HWSIM_TX_RC_GREEN_FIELD; if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_40_MHZ_WIDTH; if (rate->flags & IEEE80211_TX_RC_DUP_DATA) result |= MAC80211_HWSIM_TX_RC_DUP_DATA; if (rate->flags & IEEE80211_TX_RC_SHORT_GI) result |= MAC80211_HWSIM_TX_RC_SHORT_GI; if (rate->flags & IEEE80211_TX_RC_VHT_MCS) result |= MAC80211_HWSIM_TX_RC_VHT_MCS; if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_80_MHZ_WIDTH; if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH) result |= MAC80211_HWSIM_TX_RC_160_MHZ_WIDTH; return result; } static void mac80211_hwsim_tx_frame_nl(struct ieee80211_hw *hw, struct sk_buff *my_skb, int dst_portid, struct ieee80211_channel *channel) { struct sk_buff *skb; struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) my_skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(my_skb); void *msg_head; unsigned int hwsim_flags = 0; int i; struct hwsim_tx_rate tx_attempts[IEEE80211_TX_MAX_RATES]; struct hwsim_tx_rate_flag tx_attempts_flags[IEEE80211_TX_MAX_RATES]; uintptr_t cookie; if (data->ps != PS_DISABLED) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); /* If the queue contains MAX_QUEUE skb's drop some */ if (skb_queue_len(&data->pending) >= MAX_QUEUE) { /* Dropping until WARN_QUEUE level */ while (skb_queue_len(&data->pending) >= WARN_QUEUE) { ieee80211_free_txskb(hw, skb_dequeue(&data->pending)); data->tx_dropped++; } } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) goto nla_put_failure; msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_FRAME); if (msg_head == NULL) { pr_debug("mac80211_hwsim: problem with msg_head\n"); goto nla_put_failure; } if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto nla_put_failure; /* We get the skb->data */ if (nla_put(skb, HWSIM_ATTR_FRAME, my_skb->len, my_skb->data)) goto nla_put_failure; /* We get the flags for this transmission, and we translate them to wmediumd flags */ if (info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS) hwsim_flags |= HWSIM_TX_CTL_REQ_TX_STATUS; if (info->flags & IEEE80211_TX_CTL_NO_ACK) hwsim_flags |= HWSIM_TX_CTL_NO_ACK; if (nla_put_u32(skb, HWSIM_ATTR_FLAGS, hwsim_flags)) goto nla_put_failure; if (nla_put_u32(skb, HWSIM_ATTR_FREQ, channel->center_freq)) goto nla_put_failure; /* We get the tx control (rate and retries) info*/ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { tx_attempts[i].idx = info->status.rates[i].idx; tx_attempts_flags[i].idx = info->status.rates[i].idx; tx_attempts[i].count = info->status.rates[i].count; tx_attempts_flags[i].flags = trans_tx_rate_flags_ieee2hwsim( &info->status.rates[i]); } if (nla_put(skb, HWSIM_ATTR_TX_INFO, sizeof(struct hwsim_tx_rate)*IEEE80211_TX_MAX_RATES, tx_attempts)) goto nla_put_failure; if (nla_put(skb, HWSIM_ATTR_TX_INFO_FLAGS, sizeof(struct hwsim_tx_rate_flag) * IEEE80211_TX_MAX_RATES, tx_attempts_flags)) goto nla_put_failure; /* We create a cookie to identify this skb */ cookie = atomic_inc_return(&data->pending_cookie); info->rate_driver_data[0] = (void *)cookie; if (nla_put_u64_64bit(skb, HWSIM_ATTR_COOKIE, cookie, HWSIM_ATTR_PAD)) goto nla_put_failure; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) { if (hwsim_tx_virtio(data, skb)) goto err_free_txskb; } else { if (hwsim_unicast_netgroup(data, skb, dst_portid)) goto err_free_txskb; } /* Enqueue the packet */ skb_queue_tail(&data->pending, my_skb); data->tx_pkts++; data->tx_bytes += my_skb->len; return; nla_put_failure: nlmsg_free(skb); err_free_txskb: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); ieee80211_free_txskb(hw, my_skb); data->tx_failed++; } static bool hwsim_chans_compat(struct ieee80211_channel *c1, struct ieee80211_channel *c2) { if (!c1 || !c2) return false; return c1->center_freq == c2->center_freq; } struct tx_iter_data { struct ieee80211_channel *channel; bool receive; }; static void mac80211_hwsim_tx_iter(void *_data, u8 *addr, struct ieee80211_vif *vif) { struct tx_iter_data *data = _data; int i; /* For NAN Device simulation purposes, assume that NAN is always * on channel 6 or channel 149. */ if (vif->type == NL80211_IFTYPE_NAN) { data->receive = (data->channel && (data->channel->center_freq == 2437 || data->channel->center_freq == 5745)); return; } for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *conf; struct ieee80211_chanctx_conf *chanctx; conf = rcu_dereference(vif->link_conf[i]); if (!conf) continue; chanctx = rcu_dereference(conf->chanctx_conf); if (!chanctx) continue; if (!hwsim_chans_compat(data->channel, chanctx->def.chan)) continue; data->receive = true; return; } } static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb) { /* * To enable this code, #define the HWSIM_RADIOTAP_OUI, * e.g. like this: * #define HWSIM_RADIOTAP_OUI "\x02\x00\x00" * (but you should use a valid OUI, not that) * * If anyone wants to 'donate' a radiotap OUI/subns code * please send a patch removing this #ifdef and changing * the values accordingly. */ #ifdef HWSIM_RADIOTAP_OUI struct ieee80211_radiotap_vendor_tlv *rtap; static const char vendor_data[8] = "ABCDEFGH"; // Make sure no padding is needed BUILD_BUG_ON(sizeof(vendor_data) % 4); /* this is last radiotap info before the mac header, so * skb_reset_mac_header for mac8022 to know the end of * the radiotap TLV/beginning of the 802.11 header */ skb_reset_mac_header(skb); /* * Note that this code requires the headroom in the SKB * that was allocated earlier. */ rtap = skb_push(skb, sizeof(*rtap) + sizeof(vendor_data)); rtap->len = cpu_to_le16(sizeof(*rtap) - sizeof(struct ieee80211_radiotap_tlv) + sizeof(vendor_data)); rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE); rtap->content.oui[0] = HWSIM_RADIOTAP_OUI[0]; rtap->content.oui[1] = HWSIM_RADIOTAP_OUI[1]; rtap->content.oui[2] = HWSIM_RADIOTAP_OUI[2]; rtap->content.oui_subtype = 127; /* clear reserved field */ rtap->content.reserved = 0; rtap->content.vendor_type = 0; memcpy(rtap->content.data, vendor_data, sizeof(vendor_data)); IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END; #endif } static void mac80211_hwsim_rx(struct mac80211_hwsim_data *data, struct ieee80211_rx_status *rx_status, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; if (!ieee80211_has_morefrags(hdr->frame_control) && !is_multicast_ether_addr(hdr->addr1) && (ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_data(hdr->frame_control))) { struct ieee80211_sta *sta; unsigned int link_id; rcu_read_lock(); sta = ieee80211_find_sta_by_link_addrs(data->hw, hdr->addr2, hdr->addr1, &link_id); if (sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; if (ieee80211_has_pm(hdr->frame_control)) sp->active_links_rx &= ~BIT(link_id); else sp->active_links_rx |= BIT(link_id); rx_status->link_valid = true; rx_status->link_id = link_id; } rcu_read_unlock(); } memcpy(IEEE80211_SKB_RXCB(skb), rx_status, sizeof(*rx_status)); mac80211_hwsim_add_vendor_rtap(skb); data->rx_pkts++; data->rx_bytes += skb->len; ieee80211_rx_irqsafe(data->hw, skb); } static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv, *data2; bool ack = false; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_rx_status rx_status; u64 now; memset(&rx_status, 0, sizeof(rx_status)); rx_status.flag |= RX_FLAG_MACTIME_START; rx_status.freq = chan->center_freq; rx_status.freq_offset = chan->freq_offset ? 1 : 0; rx_status.band = chan->band; if (info->control.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { rx_status.rate_idx = ieee80211_rate_get_vht_mcs(&info->control.rates[0]); rx_status.nss = ieee80211_rate_get_vht_nss(&info->control.rates[0]); rx_status.encoding = RX_ENC_VHT; } else { rx_status.rate_idx = info->control.rates[0].idx; if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS) rx_status.encoding = RX_ENC_HT; } if (info->control.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_40; else if (info->control.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_80; else if (info->control.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) rx_status.bw = RATE_INFO_BW_160; else rx_status.bw = RATE_INFO_BW_20; if (info->control.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) rx_status.enc_flags |= RX_ENC_FLAG_SHORT_GI; /* TODO: simulate optional packet loss */ rx_status.signal = data->rx_rssi; if (info->control.vif) rx_status.signal += info->control.vif->bss_conf.txpower; if (data->ps != PS_DISABLED) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); /* release the skb's source info */ skb_orphan(skb); skb_dst_drop(skb); skb->mark = 0; skb_ext_reset(skb); nf_reset_ct(skb); /* * Get absolute mactime here so all HWs RX at the "same time", and * absolute TX time for beacon mactime so the timestamp matches. * Giving beacons a different mactime than non-beacons looks messy, but * it helps the Toffset be exact and a ~10us mactime discrepancy * probably doesn't really matter. */ if (ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control)) { rx_status.boottime_ns = ktime_get_boottime_ns(); now = data->abs_bcn_ts; } else { now = mac80211_hwsim_get_tsf_raw(); } /* Copy skb to all enabled radios that are on the current frequency */ spin_lock(&hwsim_radio_lock); list_for_each_entry(data2, &hwsim_radios, list) { struct sk_buff *nskb; struct tx_iter_data tx_iter_data = { .receive = false, .channel = chan, }; if (data == data2) continue; if (!data2->started || (data2->idle && !data2->tmp_chan) || !hwsim_ps_rx_ok(data2, skb)) continue; if (!(data->group & data2->group)) continue; if (data->netgroup != data2->netgroup) continue; if (!hwsim_chans_compat(chan, data2->tmp_chan) && !hwsim_chans_compat(chan, data2->channel)) { ieee80211_iterate_active_interfaces_atomic( data2->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_tx_iter, &tx_iter_data); if (!tx_iter_data.receive) continue; } /* * reserve some space for our vendor and the normal * radiotap header, since we're copying anyway */ if (skb->len < PAGE_SIZE && paged_rx) { struct page *page = alloc_page(GFP_ATOMIC); if (!page) continue; nskb = dev_alloc_skb(128); if (!nskb) { __free_page(page); continue; } memcpy(page_address(page), skb->data, skb->len); skb_add_rx_frag(nskb, 0, page, 0, skb->len, skb->len); } else { nskb = skb_copy(skb, GFP_ATOMIC); if (!nskb) continue; } if (mac80211_hwsim_addr_match(data2, hdr->addr1)) ack = true; rx_status.mactime = now + data2->tsf_offset; mac80211_hwsim_rx(data2, &rx_status, nskb); } spin_unlock(&hwsim_radio_lock); return ack; } static struct ieee80211_bss_conf * mac80211_hwsim_select_tx_link(struct mac80211_hwsim_data *data, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_hdr *hdr, struct ieee80211_link_sta **link_sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; int i; if (!ieee80211_vif_is_mld(vif)) return &vif->bss_conf; WARN_ON(is_multicast_ether_addr(hdr->addr1)); if (WARN_ON_ONCE(!sta || !sta->valid_links)) return &vif->bss_conf; for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) { struct ieee80211_bss_conf *bss_conf; unsigned int link_id; /* round-robin the available link IDs */ link_id = (sp->last_link + i + 1) % ARRAY_SIZE(vif->link_conf); if (!(vif->active_links & BIT(link_id))) continue; if (!(sp->active_links_rx & BIT(link_id))) continue; *link_sta = rcu_dereference(sta->link[link_id]); if (!*link_sta) continue; bss_conf = rcu_dereference(vif->link_conf[link_id]); if (WARN_ON_ONCE(!bss_conf)) continue; /* can happen while switching links */ if (!rcu_access_pointer(bss_conf->chanctx_conf)) continue; sp->last_link = link_id; return bss_conf; } return NULL; } static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct ieee80211_vif *vif = txi->control.vif; bool ack; enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; u32 _portid, i; if (WARN_ON(skb->len < 10)) { /* Should not happen; just a sanity check for addr1 use */ ieee80211_free_txskb(hw, skb); return; } if (vif && vif->type == NL80211_IFTYPE_NAN && !data->tmp_chan) { /* For NAN Device simulation purposes, assume that NAN is always * on channel 6 or channel 149, unless a ROC is in progress (for * USD use cases). */ if (data->nan_curr_dw_band == NL80211_BAND_2GHZ) channel = ieee80211_get_channel(hw->wiphy, 2437); else if (data->nan_curr_dw_band == NL80211_BAND_5GHZ) channel = ieee80211_get_channel(hw->wiphy, 5745); else channel = NULL; if (WARN_ON(!channel)) { ieee80211_free_txskb(hw, skb); return; } } else if (!data->use_chanctx) { channel = data->channel; confbw = data->bw; } else if (txi->hw_queue == 4) { channel = data->tmp_chan; } else { u8 link = u32_get_bits(IEEE80211_SKB_CB(skb)->control.flags, IEEE80211_TX_CTRL_MLO_LINK); struct ieee80211_link_sta *link_sta = NULL; struct ieee80211_sta *sta = control->sta; struct ieee80211_bss_conf *bss_conf; /* This can happen in case of monitor injection */ if (!vif) { ieee80211_free_txskb(hw, skb); return; } if (link != IEEE80211_LINK_UNSPECIFIED) { bss_conf = rcu_dereference(vif->link_conf[link]); if (sta) link_sta = rcu_dereference(sta->link[link]); } else { bss_conf = mac80211_hwsim_select_tx_link(data, vif, sta, hdr, &link_sta); } if (unlikely(!bss_conf)) { /* if it's an MLO STA, it might have deactivated all * links temporarily - but we don't handle real PS in * this code yet, so just drop the frame in that case */ WARN(link != IEEE80211_LINK_UNSPECIFIED || !sta || !sta->mlo, "link:%d, sta:%pM, sta->mlo:%d\n", link, sta ? sta->addr : NULL, sta ? sta->mlo : -1); ieee80211_free_txskb(hw, skb); return; } /* Do address translations only between shared links. It is * possible that while an non-AP MLD station and an AP MLD * station have shared links, the frame is intended to be sent * on a link which is not shared (for example when sending a * probe response). */ if (sta && sta->mlo && link_sta) { /* address translation to link addresses on TX */ ether_addr_copy(hdr->addr1, link_sta->addr); ether_addr_copy(hdr->addr2, bss_conf->addr); /* translate A3 only if it's the BSSID */ if (!ieee80211_has_tods(hdr->frame_control) && !ieee80211_has_fromds(hdr->frame_control)) { if (ether_addr_equal(hdr->addr3, sta->addr)) ether_addr_copy(hdr->addr3, link_sta->addr); else if (ether_addr_equal(hdr->addr3, vif->addr)) ether_addr_copy(hdr->addr3, bss_conf->addr); } /* no need to look at A4, if present it's SA */ } chanctx_conf = rcu_dereference(bss_conf->chanctx_conf); if (chanctx_conf) { channel = chanctx_conf->def.chan; confbw = chanctx_conf->def.width; } else { channel = NULL; } } if (WARN(!channel, "TX w/o channel - queue = %d\n", txi->hw_queue)) { ieee80211_free_txskb(hw, skb); return; } if (data->idle && !data->tmp_chan) { wiphy_dbg(hw->wiphy, "Trying to TX when idle - reject\n"); ieee80211_free_txskb(hw, skb); return; } if (vif) hwsim_check_magic(vif); if (control->sta) hwsim_check_sta_magic(control->sta); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) ieee80211_get_tx_rates(vif, control->sta, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); for (i = 0; i < ARRAY_SIZE(txi->control.rates); i++) { u16 rflags = txi->control.rates[i].flags; /* initialize to data->bw for 5/10 MHz handling */ enum nl80211_chan_width bw = data->bw; if (txi->control.rates[i].idx == -1) break; if (rflags & IEEE80211_TX_RC_40_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_40; else if (rflags & IEEE80211_TX_RC_80_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_80; else if (rflags & IEEE80211_TX_RC_160_MHZ_WIDTH) bw = NL80211_CHAN_WIDTH_160; if (WARN_ON(hwsim_get_chanwidth(bw) > hwsim_get_chanwidth(confbw))) return; } if (skb->len >= 24 + 8 && ieee80211_is_probe_resp(hdr->frame_control)) { /* fake header transmission time */ struct ieee80211_mgmt *mgmt; struct ieee80211_rate *txrate; /* TODO: get MCS */ int bitrate = 100; u64 ts; mgmt = (struct ieee80211_mgmt *)skb->data; txrate = ieee80211_get_tx_rate(hw, txi); if (txrate) bitrate = txrate->bitrate; ts = mac80211_hwsim_get_tsf_raw(); mgmt->u.probe_resp.timestamp = cpu_to_le64(ts + data->tsf_offset + 24 * 8 * 10 / bitrate); } mac80211_hwsim_monitor_rx(hw, skb, channel); /* wmediumd mode check */ _portid = READ_ONCE(data->wmediumd); if (_portid || hwsim_virtio_enabled) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid, channel); /* NO wmediumd detected, perfect medium simulation */ data->tx_pkts++; data->tx_bytes += skb->len; ack = mac80211_hwsim_tx_frame_no_nl(hw, skb, channel); if (ack && skb->len >= 16) mac80211_hwsim_monitor_ack(channel, hdr->addr2); ieee80211_tx_info_clear_status(txi); /* frame was transmitted at most favorable rate at first attempt */ txi->control.rates[0].count = 1; txi->control.rates[1].idx = -1; if (!(txi->flags & IEEE80211_TX_CTL_NO_ACK) && ack) txi->flags |= IEEE80211_TX_STAT_ACK; ieee80211_tx_status_irqsafe(hw, skb); } static int mac80211_hwsim_start(struct ieee80211_hw *hw) { struct mac80211_hwsim_data *data = hw->priv; wiphy_dbg(hw->wiphy, "%s\n", __func__); data->started = true; return 0; } static void mac80211_hwsim_stop(struct ieee80211_hw *hw, bool suspend) { struct mac80211_hwsim_data *data = hw->priv; int i; data->started = false; for (i = 0; i < ARRAY_SIZE(data->link_data); i++) hrtimer_cancel(&data->link_data[i].beacon_timer); while (!skb_queue_empty(&data->pending)) ieee80211_free_txskb(hw, skb_dequeue(&data->pending)); wiphy_dbg(hw->wiphy, "%s\n", __func__); } static int mac80211_hwsim_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), vif->addr); hwsim_set_magic(vif); if (vif->type != NL80211_IFTYPE_MONITOR) mac80211_hwsim_config_mac_nl(hw, vif->addr, true); vif->cab_queue = 0; vif->hw_queue[IEEE80211_AC_VO] = 0; vif->hw_queue[IEEE80211_AC_VI] = 1; vif->hw_queue[IEEE80211_AC_BE] = 2; vif->hw_queue[IEEE80211_AC_BK] = 3; return 0; } #ifdef CONFIG_MAC80211_DEBUGFS static void mac80211_hwsim_link_add_debugfs(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct dentry *dir) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; debugfs_create_u32("skip_beacons", 0600, dir, &vp->skip_beacons[link_conf->link_id]); } #endif static int mac80211_hwsim_change_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_iftype newtype, bool newp2p) { newtype = ieee80211_iftype_p2p(newtype, newp2p); wiphy_dbg(hw->wiphy, "%s (old type=%d, new type=%d, mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), newtype, vif->addr); hwsim_check_magic(vif); /* * interface may change from non-AP to AP in * which case this needs to be set up again */ vif->cab_queue = 0; return 0; } static void mac80211_hwsim_remove_interface( struct ieee80211_hw *hw, struct ieee80211_vif *vif) { wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n", __func__, ieee80211_vif_type_p2p(vif), vif->addr); hwsim_check_magic(vif); hwsim_clear_magic(vif); if (vif->type != NL80211_IFTYPE_MONITOR) mac80211_hwsim_config_mac_nl(hw, vif->addr, false); } static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_channel *chan) { struct mac80211_hwsim_data *data = hw->priv; u32 _portid = READ_ONCE(data->wmediumd); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) { struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb); ieee80211_get_tx_rates(txi->control.vif, NULL, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); } mac80211_hwsim_monitor_rx(hw, skb, chan); if (_portid || hwsim_virtio_enabled) return mac80211_hwsim_tx_frame_nl(hw, skb, _portid, chan); data->tx_pkts++; data->tx_bytes += skb->len; mac80211_hwsim_tx_frame_no_nl(hw, skb, chan); dev_kfree_skb(skb); } static void __mac80211_hwsim_beacon_tx(struct ieee80211_bss_conf *link_conf, struct mac80211_hwsim_data *data, struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct ieee80211_tx_info *info; struct ieee80211_rate *txrate; struct ieee80211_mgmt *mgmt; /* TODO: get MCS */ int bitrate = 100; if (vp->skip_beacons[link_conf->link_id]) { vp->skip_beacons[link_conf->link_id]--; dev_kfree_skb(skb); return; } info = IEEE80211_SKB_CB(skb); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) ieee80211_get_tx_rates(vif, NULL, skb, info->control.rates, ARRAY_SIZE(info->control.rates)); txrate = ieee80211_get_tx_rate(hw, info); if (txrate) bitrate = txrate->bitrate; mgmt = (struct ieee80211_mgmt *) skb->data; /* fake header transmission time */ data->abs_bcn_ts = mac80211_hwsim_get_tsf_raw(); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { struct ieee80211_ext *ext = (void *) mgmt; ext->u.s1g_beacon.timestamp = cpu_to_le32(data->abs_bcn_ts + data->tsf_offset + 10 * 8 * 10 / bitrate); } else { mgmt->u.beacon.timestamp = cpu_to_le64(data->abs_bcn_ts + data->tsf_offset + 24 * 8 * 10 / bitrate); } mac80211_hwsim_tx_frame(hw, skb, rcu_dereference(link_conf->chanctx_conf)->def.chan); } static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac, struct ieee80211_vif *vif) { struct mac80211_hwsim_link_data *link_data = arg; u32 link_id = link_data->link_id; struct ieee80211_bss_conf *link_conf, *tx_bss_conf; struct mac80211_hwsim_data *data = container_of(link_data, struct mac80211_hwsim_data, link_data[link_id]); struct ieee80211_hw *hw = data->hw; struct sk_buff *skb; hwsim_check_magic(vif); link_conf = rcu_dereference(vif->link_conf[link_id]); if (!link_conf) return; if (vif->type != NL80211_IFTYPE_AP && vif->type != NL80211_IFTYPE_MESH_POINT && vif->type != NL80211_IFTYPE_ADHOC && vif->type != NL80211_IFTYPE_OCB) return; tx_bss_conf = rcu_access_pointer(link_conf->tx_bss_conf); if (tx_bss_conf && tx_bss_conf != link_conf) return; if (link_conf->ema_ap) { struct ieee80211_ema_beacons *ema; u8 i = 0; ema = ieee80211_beacon_get_template_ema_list(hw, vif, link_id); if (!ema || !ema->cnt) return; for (i = 0; i < ema->cnt; i++) { __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif, ema->bcn[i].skb); ema->bcn[i].skb = NULL; /* Already freed */ } ieee80211_beacon_free_ema_list(ema); } else { skb = ieee80211_beacon_get(hw, vif, link_id); if (!skb) return; __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif, skb); } while ((skb = ieee80211_get_buffered_bc(hw, vif)) != NULL) { mac80211_hwsim_tx_frame(hw, skb, rcu_dereference(link_conf->chanctx_conf)->def.chan); } if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif, link_id)) ieee80211_csa_finish(vif, link_id); if (link_conf->color_change_active && ieee80211_beacon_cntdwn_is_complete(vif, link_id)) ieee80211_color_change_finish(vif, link_id); } static enum hrtimer_restart mac80211_hwsim_beacon(struct hrtimer *timer) { struct mac80211_hwsim_link_data *link_data = container_of(timer, struct mac80211_hwsim_link_data, beacon_timer); struct mac80211_hwsim_data *data = container_of(link_data, struct mac80211_hwsim_data, link_data[link_data->link_id]); struct ieee80211_hw *hw = data->hw; u64 bcn_int = link_data->beacon_int; if (!data->started) return HRTIMER_NORESTART; ieee80211_iterate_active_interfaces_atomic( hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_beacon_tx, link_data); /* beacon at new TBTT + beacon interval */ if (data->bcn_delta) { bcn_int -= data->bcn_delta; data->bcn_delta = 0; } hrtimer_forward_now(&link_data->beacon_timer, ns_to_ktime(bcn_int * NSEC_PER_USEC)); return HRTIMER_RESTART; } static const char * const hwsim_chanwidths[] = { [NL80211_CHAN_WIDTH_5] = "ht5", [NL80211_CHAN_WIDTH_10] = "ht10", [NL80211_CHAN_WIDTH_20_NOHT] = "noht", [NL80211_CHAN_WIDTH_20] = "ht20", [NL80211_CHAN_WIDTH_40] = "ht40", [NL80211_CHAN_WIDTH_80] = "vht80", [NL80211_CHAN_WIDTH_80P80] = "vht80p80", [NL80211_CHAN_WIDTH_160] = "vht160", [NL80211_CHAN_WIDTH_1] = "1MHz", [NL80211_CHAN_WIDTH_2] = "2MHz", [NL80211_CHAN_WIDTH_4] = "4MHz", [NL80211_CHAN_WIDTH_8] = "8MHz", [NL80211_CHAN_WIDTH_16] = "16MHz", [NL80211_CHAN_WIDTH_320] = "eht320", }; static int mac80211_hwsim_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { [IEEE80211_SMPS_AUTOMATIC] = "auto", [IEEE80211_SMPS_OFF] = "off", [IEEE80211_SMPS_STATIC] = "static", [IEEE80211_SMPS_DYNAMIC] = "dynamic", }; int idx; if (conf->chandef.chan) wiphy_dbg(hw->wiphy, "%s (freq=%d(%d - %d)/%s idle=%d ps=%d smps=%s)\n", __func__, conf->chandef.chan->center_freq, conf->chandef.center_freq1, conf->chandef.center_freq2, hwsim_chanwidths[conf->chandef.width], !!(conf->flags & IEEE80211_CONF_IDLE), !!(conf->flags & IEEE80211_CONF_PS), smps_modes[conf->smps_mode]); else wiphy_dbg(hw->wiphy, "%s (freq=0 idle=%d ps=%d smps=%s)\n", __func__, !!(conf->flags & IEEE80211_CONF_IDLE), !!(conf->flags & IEEE80211_CONF_PS), smps_modes[conf->smps_mode]); data->idle = !!(conf->flags & IEEE80211_CONF_IDLE); WARN_ON(conf->chandef.chan && data->use_chanctx); mutex_lock(&data->mutex); if (data->scanning && conf->chandef.chan) { for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) { if (data->survey_data[idx].channel == data->channel) { data->survey_data[idx].start = data->survey_data[idx].next_start; data->survey_data[idx].end = jiffies; break; } } data->channel = conf->chandef.chan; data->bw = conf->chandef.width; for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) { if (data->survey_data[idx].channel && data->survey_data[idx].channel != data->channel) continue; data->survey_data[idx].channel = data->channel; data->survey_data[idx].next_start = jiffies; break; } } else { data->channel = conf->chandef.chan; data->bw = conf->chandef.width; } mutex_unlock(&data->mutex); for (idx = 0; idx < ARRAY_SIZE(data->link_data); idx++) { struct mac80211_hwsim_link_data *link_data = &data->link_data[idx]; if (!data->started || !link_data->beacon_int) { hrtimer_cancel(&link_data->beacon_timer); } else if (!hrtimer_active(&link_data->beacon_timer)) { u64 tsf = mac80211_hwsim_get_tsf(hw, NULL); u32 bcn_int = link_data->beacon_int; u64 until_tbtt = bcn_int - do_div(tsf, bcn_int); hrtimer_start(&link_data->beacon_timer, ns_to_ktime(until_tbtt * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); } } return 0; } static void mac80211_hwsim_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags,u64 multicast) { struct mac80211_hwsim_data *data = hw->priv; wiphy_dbg(hw->wiphy, "%s\n", __func__); data->rx_filter = 0; if (*total_flags & FIF_ALLMULTI) data->rx_filter |= FIF_ALLMULTI; if (*total_flags & FIF_MCAST_ACTION) data->rx_filter |= FIF_MCAST_ACTION; *total_flags = data->rx_filter; } static void mac80211_hwsim_bcn_en_iter(void *data, u8 *mac, struct ieee80211_vif *vif) { unsigned int *count = data; struct hwsim_vif_priv *vp = (void *)vif->drv_priv; if (vp->bcn_en) (*count)++; } static void mac80211_hwsim_vif_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; hwsim_check_magic(vif); wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM)\n", __func__, changed, vif->addr); if (changed & BSS_CHANGED_ASSOC) { wiphy_dbg(hw->wiphy, " ASSOC: assoc=%d aid=%d\n", vif->cfg.assoc, vif->cfg.aid); vp->assoc = vif->cfg.assoc; vp->aid = vif->cfg.aid; } if (vif->type == NL80211_IFTYPE_STATION && changed & (BSS_CHANGED_MLD_VALID_LINKS | BSS_CHANGED_MLD_TTLM)) { u16 usable_links = ieee80211_vif_usable_links(vif); if (vif->active_links != usable_links) ieee80211_set_active_links_async(vif, usable_links); } } static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed) { struct hwsim_vif_priv *vp = (void *)vif->drv_priv; struct mac80211_hwsim_data *data = hw->priv; unsigned int link_id = info->link_id; struct mac80211_hwsim_link_data *link_data = &data->link_data[link_id]; hwsim_check_magic(vif); wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM, link id %u)\n", __func__, (unsigned long long)changed, vif->addr, link_id); if (changed & BSS_CHANGED_BSSID) { wiphy_dbg(hw->wiphy, "%s: BSSID changed: %pM\n", __func__, info->bssid); memcpy(vp->bssid, info->bssid, ETH_ALEN); } if (changed & BSS_CHANGED_BEACON_ENABLED) { wiphy_dbg(hw->wiphy, " BCN EN: %d (BI=%u)\n", info->enable_beacon, info->beacon_int); vp->bcn_en = info->enable_beacon; if (data->started && !hrtimer_active(&link_data->beacon_timer) && info->enable_beacon) { u64 tsf, until_tbtt; u32 bcn_int; link_data->beacon_int = info->beacon_int * 1024; tsf = mac80211_hwsim_get_tsf(hw, vif); bcn_int = link_data->beacon_int; until_tbtt = bcn_int - do_div(tsf, bcn_int); hrtimer_start(&link_data->beacon_timer, ns_to_ktime(until_tbtt * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); } else if (!info->enable_beacon) { unsigned int count = 0; ieee80211_iterate_active_interfaces_atomic( data->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_bcn_en_iter, &count); wiphy_dbg(hw->wiphy, " beaconing vifs remaining: %u", count); if (count == 0) { hrtimer_cancel(&link_data->beacon_timer); link_data->beacon_int = 0; } } } if (changed & BSS_CHANGED_ERP_CTS_PROT) { wiphy_dbg(hw->wiphy, " ERP_CTS_PROT: %d\n", info->use_cts_prot); } if (changed & BSS_CHANGED_ERP_PREAMBLE) { wiphy_dbg(hw->wiphy, " ERP_PREAMBLE: %d\n", info->use_short_preamble); } if (changed & BSS_CHANGED_ERP_SLOT) { wiphy_dbg(hw->wiphy, " ERP_SLOT: %d\n", info->use_short_slot); } if (changed & BSS_CHANGED_HT) { wiphy_dbg(hw->wiphy, " HT: op_mode=0x%x\n", info->ht_operation_mode); } if (changed & BSS_CHANGED_BASIC_RATES) { wiphy_dbg(hw->wiphy, " BASIC_RATES: 0x%llx\n", (unsigned long long) info->basic_rates); } if (changed & BSS_CHANGED_TXPOWER) wiphy_dbg(hw->wiphy, " TX Power: %d dBm\n", info->txpower); } static void mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_sta *sta = link_sta->sta; u32 bw = U32_MAX; int link_id; rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE(vif->link_conf); link_id++) { enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_bss_conf *vif_conf; link_sta = rcu_dereference(sta->link[link_id]); if (!link_sta) continue; switch (link_sta->bandwidth) { #define C(_bw) case IEEE80211_STA_RX_BW_##_bw: bw = _bw; break C(20); C(40); C(80); C(160); C(320); #undef C } if (!data->use_chanctx) { confbw = data->bw; } else { struct ieee80211_chanctx_conf *chanctx_conf; vif_conf = rcu_dereference(vif->link_conf[link_id]); if (WARN_ON(!vif_conf)) continue; chanctx_conf = rcu_dereference(vif_conf->chanctx_conf); if (!WARN_ON(!chanctx_conf)) confbw = chanctx_conf->def.width; } WARN(bw > hwsim_get_chanwidth(confbw), "intf %pM [link=%d]: bad STA %pM bandwidth %d MHz (%d) > channel config %d MHz (%d)\n", vif->addr, link_id, sta->addr, bw, sta->deflink.bandwidth, hwsim_get_chanwidth(data->bw), data->bw); } rcu_read_unlock(); } static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; hwsim_check_magic(vif); hwsim_set_sta_magic(sta); mac80211_hwsim_sta_rc_update(hw, vif, &sta->deflink, 0); if (sta->valid_links) { WARN(hweight16(sta->valid_links) > 1, "expect to add STA with single link, have 0x%x\n", sta->valid_links); sp->active_links_rx = sta->valid_links; } return 0; } static int mac80211_hwsim_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { hwsim_check_magic(vif); hwsim_clear_sta_magic(sta); return 0; } static int mac80211_hwsim_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { if (new_state == IEEE80211_STA_NOTEXIST) return mac80211_hwsim_sta_remove(hw, vif, sta); if (old_state == IEEE80211_STA_NOTEXIST) return mac80211_hwsim_sta_add(hw, vif, sta); /* * in an MLO connection, when client is authorized * (AP station marked as such), enable all links */ if (ieee80211_vif_is_mld(vif) && vif->type == NL80211_IFTYPE_STATION && new_state == IEEE80211_STA_AUTHORIZED && !sta->tdls) ieee80211_set_active_links_async(vif, ieee80211_vif_usable_links(vif)); return 0; } static void mac80211_hwsim_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { hwsim_check_magic(vif); switch (cmd) { case STA_NOTIFY_SLEEP: case STA_NOTIFY_AWAKE: /* TODO: make good use of these flags */ break; default: WARN(1, "Invalid sta notify: %d\n", cmd); break; } } static int mac80211_hwsim_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set) { hwsim_check_sta_magic(sta); return 0; } static int mac80211_hwsim_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 queue, const struct ieee80211_tx_queue_params *params) { wiphy_dbg(hw->wiphy, "%s (queue=%d txop=%d cw_min=%d cw_max=%d aifs=%d)\n", __func__, queue, params->txop, params->cw_min, params->cw_max, params->aifs); return 0; } static int mac80211_hwsim_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) { struct mac80211_hwsim_data *hwsim = hw->priv; if (idx < 0 || idx >= ARRAY_SIZE(hwsim->survey_data)) return -ENOENT; mutex_lock(&hwsim->mutex); survey->channel = hwsim->survey_data[idx].channel; if (!survey->channel) { mutex_unlock(&hwsim->mutex); return -ENOENT; } /* * Magically conjured dummy values --- this is only ok for simulated hardware. * * A real driver which cannot determine real values noise MUST NOT * report any, especially not a magically conjured ones :-) */ survey->filled = SURVEY_INFO_NOISE_DBM | SURVEY_INFO_TIME | SURVEY_INFO_TIME_BUSY; survey->noise = -92; survey->time = jiffies_to_msecs(hwsim->survey_data[idx].end - hwsim->survey_data[idx].start); /* report 12.5% of channel time is used */ survey->time_busy = survey->time/8; mutex_unlock(&hwsim->mutex); return 0; } static enum ieee80211_neg_ttlm_res mac80211_hwsim_can_neg_ttlm(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *neg_ttlm) { u32 i; /* For testing purposes, accept if all TIDs are mapped to the same links * set, otherwise reject. */ for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if (neg_ttlm->downlink[i] != neg_ttlm->uplink[i] || neg_ttlm->downlink[i] != neg_ttlm->downlink[0]) return NEG_TTLM_RES_REJECT; } return NEG_TTLM_RES_ACCEPT; } #ifdef CONFIG_NL80211_TESTMODE /* * This section contains example code for using netlink * attributes with the testmode command in nl80211. */ /* These enums need to be kept in sync with userspace */ enum hwsim_testmode_attr { __HWSIM_TM_ATTR_INVALID = 0, HWSIM_TM_ATTR_CMD = 1, HWSIM_TM_ATTR_PS = 2, /* keep last */ __HWSIM_TM_ATTR_AFTER_LAST, HWSIM_TM_ATTR_MAX = __HWSIM_TM_ATTR_AFTER_LAST - 1 }; enum hwsim_testmode_cmd { HWSIM_TM_CMD_SET_PS = 0, HWSIM_TM_CMD_GET_PS = 1, HWSIM_TM_CMD_STOP_QUEUES = 2, HWSIM_TM_CMD_WAKE_QUEUES = 3, }; static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = { [HWSIM_TM_ATTR_CMD] = { .type = NLA_U32 }, [HWSIM_TM_ATTR_PS] = { .type = NLA_U32 }, }; static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len) { struct mac80211_hwsim_data *hwsim = hw->priv; struct nlattr *tb[HWSIM_TM_ATTR_MAX + 1]; struct sk_buff *skb; int err, ps; err = nla_parse_deprecated(tb, HWSIM_TM_ATTR_MAX, data, len, hwsim_testmode_policy, NULL); if (err) return err; if (!tb[HWSIM_TM_ATTR_CMD]) return -EINVAL; switch (nla_get_u32(tb[HWSIM_TM_ATTR_CMD])) { case HWSIM_TM_CMD_SET_PS: if (!tb[HWSIM_TM_ATTR_PS]) return -EINVAL; ps = nla_get_u32(tb[HWSIM_TM_ATTR_PS]); return hwsim_fops_ps_write(hwsim, ps); case HWSIM_TM_CMD_GET_PS: skb = cfg80211_testmode_alloc_reply_skb(hw->wiphy, nla_total_size(sizeof(u32))); if (!skb) return -ENOMEM; if (nla_put_u32(skb, HWSIM_TM_ATTR_PS, hwsim->ps)) goto nla_put_failure; return cfg80211_testmode_reply(skb); case HWSIM_TM_CMD_STOP_QUEUES: ieee80211_stop_queues(hw); return 0; case HWSIM_TM_CMD_WAKE_QUEUES: ieee80211_wake_queues(hw); return 0; default: return -EOPNOTSUPP; } nla_put_failure: kfree_skb(skb); return -ENOBUFS; } #endif static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params) { struct ieee80211_sta *sta = params->sta; enum ieee80211_ampdu_mlme_action action = params->action; u16 tid = params->tid; switch (action) { case IEEE80211_AMPDU_TX_START: return IEEE80211_AMPDU_TX_START_IMMEDIATE; case IEEE80211_AMPDU_TX_STOP_CONT: case IEEE80211_AMPDU_TX_STOP_FLUSH: case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid); break; case IEEE80211_AMPDU_TX_OPERATIONAL: break; case IEEE80211_AMPDU_RX_START: case IEEE80211_AMPDU_RX_STOP: break; default: return -EOPNOTSUPP; } return 0; } static void mac80211_hwsim_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop) { /* Not implemented, queues only on kernel side */ } static void hw_scan_work(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, hw_scan.work); struct cfg80211_scan_request *req = hwsim->hw_scan_request; int dwell, i; mutex_lock(&hwsim->mutex); if (hwsim->scan_chan_idx >= req->n_channels) { struct cfg80211_scan_info info = { .aborted = false, }; wiphy_dbg(hwsim->hw->wiphy, "hw scan complete\n"); ieee80211_scan_completed(hwsim->hw, &info); hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); mac80211_hwsim_config_mac_nl(hwsim->hw, hwsim->scan_addr, false); return; } wiphy_dbg(hwsim->hw->wiphy, "hw scan %d MHz\n", req->channels[hwsim->scan_chan_idx]->center_freq); hwsim->tmp_chan = req->channels[hwsim->scan_chan_idx]; if (hwsim->tmp_chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR) || !req->n_ssids) { dwell = 120; } else { dwell = 30; /* send probes */ for (i = 0; i < req->n_ssids; i++) { struct sk_buff *probe; struct ieee80211_mgmt *mgmt; probe = ieee80211_probereq_get(hwsim->hw, hwsim->scan_addr, req->ssids[i].ssid, req->ssids[i].ssid_len, req->ie_len); if (!probe) continue; mgmt = (struct ieee80211_mgmt *) probe->data; memcpy(mgmt->da, req->bssid, ETH_ALEN); memcpy(mgmt->bssid, req->bssid, ETH_ALEN); if (req->ie_len) skb_put_data(probe, req->ie, req->ie_len); rcu_read_lock(); if (!ieee80211_tx_prepare_skb(hwsim->hw, hwsim->hw_scan_vif, probe, hwsim->tmp_chan->band, NULL)) { rcu_read_unlock(); kfree_skb(probe); continue; } local_bh_disable(); mac80211_hwsim_tx_frame(hwsim->hw, probe, hwsim->tmp_chan); rcu_read_unlock(); local_bh_enable(); } } ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, msecs_to_jiffies(dwell)); hwsim->survey_data[hwsim->scan_chan_idx].channel = hwsim->tmp_chan; hwsim->survey_data[hwsim->scan_chan_idx].start = jiffies; hwsim->survey_data[hwsim->scan_chan_idx].end = jiffies + msecs_to_jiffies(dwell); hwsim->scan_chan_idx++; mutex_unlock(&hwsim->mutex); } static int mac80211_hwsim_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_scan_request *hw_req) { struct mac80211_hwsim_data *hwsim = hw->priv; struct cfg80211_scan_request *req = &hw_req->req; mutex_lock(&hwsim->mutex); if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) { mutex_unlock(&hwsim->mutex); return -EBUSY; } hwsim->hw_scan_request = req; hwsim->hw_scan_vif = vif; hwsim->scan_chan_idx = 0; if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(hwsim->scan_addr, hw_req->req.mac_addr, hw_req->req.mac_addr_mask); else memcpy(hwsim->scan_addr, vif->addr, ETH_ALEN); memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); mutex_unlock(&hwsim->mutex); mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); wiphy_dbg(hw->wiphy, "hwsim hw_scan request\n"); ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, 0); return 0; } static void mac80211_hwsim_cancel_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; struct cfg80211_scan_info info = { .aborted = true, }; wiphy_dbg(hw->wiphy, "hwsim cancel_hw_scan\n"); cancel_delayed_work_sync(&hwsim->hw_scan); mutex_lock(&hwsim->mutex); ieee80211_scan_completed(hwsim->hw, &info); hwsim->tmp_chan = NULL; hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; mutex_unlock(&hwsim->mutex); } static void mac80211_hwsim_sw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const u8 *mac_addr) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); if (hwsim->scanning) { pr_debug("two hwsim sw_scans detected!\n"); goto out; } pr_debug("hwsim sw_scan request, prepping stuff\n"); memcpy(hwsim->scan_addr, mac_addr, ETH_ALEN); mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); hwsim->scanning = true; memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); out: mutex_unlock(&hwsim->mutex); } static void mac80211_hwsim_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); pr_debug("hwsim sw_scan_complete\n"); hwsim->scanning = false; mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, false); eth_zero_addr(hwsim->scan_addr); mutex_unlock(&hwsim->mutex); } static void hw_roc_start(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, roc_start.work); mutex_lock(&hwsim->mutex); wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC begins\n"); hwsim->tmp_chan = hwsim->roc_chan; ieee80211_ready_on_channel(hwsim->hw); ieee80211_queue_delayed_work(hwsim->hw, &hwsim->roc_done, msecs_to_jiffies(hwsim->roc_duration)); mutex_unlock(&hwsim->mutex); } static void hw_roc_done(struct work_struct *work) { struct mac80211_hwsim_data *hwsim = container_of(work, struct mac80211_hwsim_data, roc_done.work); mutex_lock(&hwsim->mutex); ieee80211_remain_on_channel_expired(hwsim->hw); hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC expired\n"); } static int mac80211_hwsim_roc(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type) { struct mac80211_hwsim_data *hwsim = hw->priv; mutex_lock(&hwsim->mutex); if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) { mutex_unlock(&hwsim->mutex); return -EBUSY; } hwsim->roc_chan = chan; hwsim->roc_duration = duration; mutex_unlock(&hwsim->mutex); wiphy_dbg(hw->wiphy, "hwsim ROC (%d MHz, %d ms)\n", chan->center_freq, duration); ieee80211_queue_delayed_work(hw, &hwsim->roc_start, HZ/50); return 0; } static int mac80211_hwsim_croc(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; cancel_delayed_work_sync(&hwsim->roc_start); cancel_delayed_work_sync(&hwsim->roc_done); mutex_lock(&hwsim->mutex); hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); wiphy_dbg(hw->wiphy, "hwsim ROC canceled\n"); return 0; } static int mac80211_hwsim_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx) { hwsim_set_chanctx_magic(ctx); wiphy_dbg(hw->wiphy, "add channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); return 0; } static void mac80211_hwsim_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx) { wiphy_dbg(hw->wiphy, "remove channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); hwsim_check_chanctx_magic(ctx); hwsim_clear_chanctx_magic(ctx); } static void mac80211_hwsim_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed) { hwsim_check_chanctx_magic(ctx); wiphy_dbg(hw->wiphy, "change channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n", ctx->def.chan->center_freq, ctx->def.width, ctx->def.center_freq1, ctx->def.center_freq2); } static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx) { hwsim_check_magic(vif); hwsim_check_chanctx_magic(ctx); /* if we activate a link while already associated wake it up */ if (vif->type == NL80211_IFTYPE_STATION && vif->cfg.assoc) { struct sk_buff *skb; skb = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true); if (skb) { local_bh_disable(); mac80211_hwsim_tx_frame(hw, skb, ctx->def.chan); local_bh_enable(); } } return 0; } static void mac80211_hwsim_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx) { hwsim_check_magic(vif); hwsim_check_chanctx_magic(ctx); /* if we deactivate a link while associated suspend it first */ if (vif->type == NL80211_IFTYPE_STATION && vif->cfg.assoc) { struct sk_buff *skb; skb = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true); if (skb) { struct ieee80211_hdr *hdr = (void *)skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); local_bh_disable(); mac80211_hwsim_tx_frame(hw, skb, ctx->def.chan); local_bh_enable(); } } } static int mac80211_hwsim_switch_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) { int i; if (n_vifs <= 0) return -EINVAL; wiphy_dbg(hw->wiphy, "switch vif channel context mode: %u\n", mode); for (i = 0; i < n_vifs; i++) { hwsim_check_chanctx_magic(vifs[i].old_ctx); wiphy_dbg(hw->wiphy, "switch vif channel context: %d MHz/width: %d/cfreqs:%d/%d MHz -> %d MHz/width: %d/cfreqs:%d/%d MHz\n", vifs[i].old_ctx->def.chan->center_freq, vifs[i].old_ctx->def.width, vifs[i].old_ctx->def.center_freq1, vifs[i].old_ctx->def.center_freq2, vifs[i].new_ctx->def.chan->center_freq, vifs[i].new_ctx->def.width, vifs[i].new_ctx->def.center_freq1, vifs[i].new_ctx->def.center_freq2); switch (mode) { case CHANCTX_SWMODE_REASSIGN_VIF: hwsim_check_chanctx_magic(vifs[i].new_ctx); break; case CHANCTX_SWMODE_SWAP_CONTEXTS: hwsim_set_chanctx_magic(vifs[i].new_ctx); hwsim_clear_chanctx_magic(vifs[i].old_ctx); break; default: WARN(1, "Invalid mode %d\n", mode); } } return 0; } static const char mac80211_hwsim_gstrings_stats[][ETH_GSTRING_LEN] = { "tx_pkts_nic", "tx_bytes_nic", "rx_pkts_nic", "rx_bytes_nic", "d_tx_dropped", "d_tx_failed", "d_ps_mode", "d_group", }; #define MAC80211_HWSIM_SSTATS_LEN ARRAY_SIZE(mac80211_hwsim_gstrings_stats) static void mac80211_hwsim_get_et_strings(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 sset, u8 *data) { if (sset == ETH_SS_STATS) memcpy(data, mac80211_hwsim_gstrings_stats, sizeof(mac80211_hwsim_gstrings_stats)); } static int mac80211_hwsim_get_et_sset_count(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int sset) { if (sset == ETH_SS_STATS) return MAC80211_HWSIM_SSTATS_LEN; return 0; } static void mac80211_hwsim_get_et_stats(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ethtool_stats *stats, u64 *data) { struct mac80211_hwsim_data *ar = hw->priv; int i = 0; data[i++] = ar->tx_pkts; data[i++] = ar->tx_bytes; data[i++] = ar->rx_pkts; data[i++] = ar->rx_bytes; data[i++] = ar->tx_dropped; data[i++] = ar->tx_failed; data[i++] = ar->ps; data[i++] = ar->group; WARN_ON(i != MAC80211_HWSIM_SSTATS_LEN); } static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) { return 1; } static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { return -EOPNOTSUPP; } static int mac80211_hwsim_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) { unsigned long rem = old_links & ~new_links; unsigned long add = new_links & ~old_links; int i; if (!old_links) rem |= BIT(0); if (!new_links) add |= BIT(0); for_each_set_bit(i, &rem, IEEE80211_MLD_MAX_NUM_LINKS) mac80211_hwsim_config_mac_nl(hw, old[i]->addr, false); for_each_set_bit(i, &add, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *link_conf; link_conf = link_conf_dereference_protected(vif, i); if (WARN_ON(!link_conf)) continue; mac80211_hwsim_config_mac_nl(hw, link_conf->addr, true); } return 0; } static int mac80211_hwsim_change_sta_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { struct hwsim_sta_priv *sp = (void *)sta->drv_priv; hwsim_check_sta_magic(sta); if (vif->type == NL80211_IFTYPE_STATION) sp->active_links_rx = new_links; return 0; } static int mac80211_hwsim_send_pmsr_ftm_request_peer(struct sk_buff *msg, struct cfg80211_pmsr_ftm_request_peer *request) { struct nlattr *ftm; if (!request->requested) return -EINVAL; ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM); if (!ftm) return -ENOBUFS; if (nla_put_u32(msg, NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE, request->preamble)) return -ENOBUFS; if (nla_put_u16(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD, request->burst_period)) return -ENOBUFS; if (request->asap && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_ASAP)) return -ENOBUFS; if (request->request_lci && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI)) return -ENOBUFS; if (request->request_civicloc && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC)) return -ENOBUFS; if (request->trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED)) return -ENOBUFS; if (request->non_trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED)) return -ENOBUFS; if (request->lmr_feedback && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP, request->num_bursts_exp)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST, request->ftms_per_burst)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES, request->ftmr_retries)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration)) return -ENOBUFS; if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, request->bss_color)) return -ENOBUFS; nla_nest_end(msg, ftm); return 0; } static int mac80211_hwsim_send_pmsr_request_peer(struct sk_buff *msg, struct cfg80211_pmsr_request_peer *request) { struct nlattr *peer, *chandef, *req, *data; int err; peer = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS); if (!peer) return -ENOBUFS; if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, request->addr)) return -ENOBUFS; chandef = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_CHAN); if (!chandef) return -ENOBUFS; err = nl80211_send_chandef(msg, &request->chandef); if (err) return err; nla_nest_end(msg, chandef); req = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_REQ); if (!req) return -ENOBUFS; if (request->report_ap_tsf && nla_put_flag(msg, NL80211_PMSR_REQ_ATTR_GET_AP_TSF)) return -ENOBUFS; data = nla_nest_start(msg, NL80211_PMSR_REQ_ATTR_DATA); if (!data) return -ENOBUFS; err = mac80211_hwsim_send_pmsr_ftm_request_peer(msg, &request->ftm); if (err) return err; nla_nest_end(msg, data); nla_nest_end(msg, req); nla_nest_end(msg, peer); return 0; } static int mac80211_hwsim_send_pmsr_request(struct sk_buff *msg, struct cfg80211_pmsr_request *request) { struct nlattr *pmsr; int err; pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) return -ENOBUFS; if (nla_put_u32(msg, NL80211_ATTR_TIMEOUT, request->timeout)) return -ENOBUFS; if (!is_zero_ether_addr(request->mac_addr)) { if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, request->mac_addr)) return -ENOBUFS; if (nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN, request->mac_addr_mask)) return -ENOBUFS; } for (int i = 0; i < request->n_peers; i++) { err = mac80211_hwsim_send_pmsr_request_peer(msg, &request->peers[i]); if (err) return err; } nla_nest_end(msg, pmsr); return 0; } static int mac80211_hwsim_start_pmsr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request) { struct mac80211_hwsim_data *data; struct sk_buff *skb = NULL; struct nlattr *pmsr; void *msg_head; u32 _portid; int err = 0; data = hw->priv; _portid = READ_ONCE(data->wmediumd); if (!_portid && !hwsim_virtio_enabled) return -EOPNOTSUPP; mutex_lock(&data->mutex); if (data->pmsr_request) { err = -EBUSY; goto out_free; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out_free; } msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_START_PMSR); if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) { err = -ENOMEM; goto out_free; } pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST); if (!pmsr) { err = -ENOMEM; goto out_free; } err = mac80211_hwsim_send_pmsr_request(skb, request); if (err) goto out_free; nla_nest_end(skb, pmsr); genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); data->pmsr_request = request; data->pmsr_request_wdev = ieee80211_vif_to_wdev(vif); out_free: if (err && skb) nlmsg_free(skb); mutex_unlock(&data->mutex); return err; } static void mac80211_hwsim_abort_pmsr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request) { struct mac80211_hwsim_data *data; struct sk_buff *skb = NULL; struct nlattr *pmsr; void *msg_head; u32 _portid; int err = 0; data = hw->priv; _portid = READ_ONCE(data->wmediumd); if (!_portid && !hwsim_virtio_enabled) return; mutex_lock(&data->mutex); if (data->pmsr_request != request) { err = -EINVAL; goto out; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out; } msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_ABORT_PMSR); if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) goto out; pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST); if (!pmsr) { err = -ENOMEM; goto out; } err = mac80211_hwsim_send_pmsr_request(skb, request); if (err) goto out; err = nla_nest_end(skb, pmsr); if (err) goto out; genlmsg_end(skb, msg_head); if (hwsim_virtio_enabled) hwsim_tx_virtio(data, skb); else hwsim_unicast_netgroup(data, skb, _portid); out: if (err && skb) nlmsg_free(skb); mutex_unlock(&data->mutex); } static int mac80211_hwsim_parse_rate_info(struct nlattr *rateattr, struct rate_info *rate_info, struct genl_info *info) { struct nlattr *tb[HWSIM_RATE_INFO_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, HWSIM_RATE_INFO_ATTR_MAX, rateattr, hwsim_rate_info_policy, info->extack); if (ret) return ret; if (tb[HWSIM_RATE_INFO_ATTR_FLAGS]) rate_info->flags = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_FLAGS]); if (tb[HWSIM_RATE_INFO_ATTR_MCS]) rate_info->mcs = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_MCS]); if (tb[HWSIM_RATE_INFO_ATTR_LEGACY]) rate_info->legacy = nla_get_u16(tb[HWSIM_RATE_INFO_ATTR_LEGACY]); if (tb[HWSIM_RATE_INFO_ATTR_NSS]) rate_info->nss = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_NSS]); if (tb[HWSIM_RATE_INFO_ATTR_BW]) rate_info->bw = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_BW]); if (tb[HWSIM_RATE_INFO_ATTR_HE_GI]) rate_info->he_gi = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_GI]); if (tb[HWSIM_RATE_INFO_ATTR_HE_DCM]) rate_info->he_dcm = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_DCM]); if (tb[HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC]) rate_info->he_ru_alloc = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC]); if (tb[HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH]) rate_info->n_bonded_ch = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH]); if (tb[HWSIM_RATE_INFO_ATTR_EHT_GI]) rate_info->eht_gi = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_EHT_GI]); if (tb[HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC]) rate_info->eht_ru_alloc = nla_get_u8(tb[HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC]); return 0; } static int mac80211_hwsim_parse_ftm_result(struct nlattr *ftm, struct cfg80211_pmsr_ftm_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_FTM_RESP_ATTR_MAX, ftm, hwsim_ftm_result_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON]) result->failure_reason = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX]) result->burst_index = nla_get_u16(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]) { result->num_ftmr_attempts_valid = 1; result->num_ftmr_attempts = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]) { result->num_ftmr_successes_valid = 1; result->num_ftmr_successes = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME]) result->busy_retry_time = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP]) result->num_bursts_exp = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION]) result->burst_duration = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST]) result->ftms_per_burst = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST]); if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]) { result->rssi_avg_valid = 1; result->rssi_avg = nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]) { result->rssi_spread_valid = 1; result->rssi_spread = nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE]) { result->tx_rate_valid = 1; ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE], &result->tx_rate, info); if (ret) return ret; } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE]) { result->rx_rate_valid = 1; ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE], &result->rx_rate, info); if (ret) return ret; } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]) { result->rtt_avg_valid = 1; result->rtt_avg = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]) { result->rtt_variance_valid = 1; result->rtt_variance = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]) { result->rtt_spread_valid = 1; result->rtt_spread = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]) { result->dist_avg_valid = 1; result->dist_avg = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]) { result->dist_variance_valid = 1; result->dist_variance = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]) { result->dist_spread_valid = 1; result->dist_spread = nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]) { result->lci = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]); result->lci_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]); } if (tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]) { result->civicloc = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]); result->civicloc_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]); } return 0; } static int mac80211_hwsim_parse_pmsr_resp(struct nlattr *resp, struct cfg80211_pmsr_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_RESP_ATTR_MAX + 1]; struct nlattr *pmsr; int rem; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_RESP_ATTR_MAX, resp, hwsim_pmsr_resp_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_RESP_ATTR_STATUS]) result->status = nla_get_u32(tb[NL80211_PMSR_RESP_ATTR_STATUS]); if (tb[NL80211_PMSR_RESP_ATTR_HOST_TIME]) result->host_time = nla_get_u64(tb[NL80211_PMSR_RESP_ATTR_HOST_TIME]); if (tb[NL80211_PMSR_RESP_ATTR_AP_TSF]) { result->ap_tsf_valid = 1; result->ap_tsf = nla_get_u64(tb[NL80211_PMSR_RESP_ATTR_AP_TSF]); } result->final = !!tb[NL80211_PMSR_RESP_ATTR_FINAL]; if (!tb[NL80211_PMSR_RESP_ATTR_DATA]) return 0; nla_for_each_nested(pmsr, tb[NL80211_PMSR_RESP_ATTR_DATA], rem) { switch (nla_type(pmsr)) { case NL80211_PMSR_TYPE_FTM: result->type = NL80211_PMSR_TYPE_FTM; ret = mac80211_hwsim_parse_ftm_result(pmsr, &result->ftm, info); if (ret) return ret; break; default: NL_SET_ERR_MSG_ATTR(info->extack, pmsr, "Unknown pmsr resp type"); return -EINVAL; } } return 0; } static int mac80211_hwsim_parse_pmsr_result(struct nlattr *peer, struct cfg80211_pmsr_result *result, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1]; int ret; if (!peer) return -EINVAL; ret = nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, hwsim_pmsr_peer_result_policy, info->extack); if (ret) return ret; if (tb[NL80211_PMSR_PEER_ATTR_ADDR]) memcpy(result->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN); if (tb[NL80211_PMSR_PEER_ATTR_RESP]) { ret = mac80211_hwsim_parse_pmsr_resp(tb[NL80211_PMSR_PEER_ATTR_RESP], result, info); if (ret) return ret; } return 0; }; static int hwsim_pmsr_report_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; struct nlattr *peers, *peer; struct nlattr *reqattr; const u8 *src; int err; int rem; if (!info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]) return -EINVAL; src = nla_data(info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]); data = get_hwsim_data_ref_from_addr(src); if (!data) return -EINVAL; mutex_lock(&data->mutex); if (!data->pmsr_request) { err = -EINVAL; goto out; } reqattr = info->attrs[HWSIM_ATTR_PMSR_RESULT]; if (!reqattr) { err = -EINVAL; goto out; } peers = nla_find_nested(reqattr, NL80211_PMSR_ATTR_PEERS); if (!peers) { err = -EINVAL; goto out; } nla_for_each_nested(peer, peers, rem) { struct cfg80211_pmsr_result result = {}; err = mac80211_hwsim_parse_pmsr_result(peer, &result, info); if (err) goto out; cfg80211_pmsr_report(data->pmsr_request_wdev, data->pmsr_request, &result, GFP_KERNEL); } cfg80211_pmsr_complete(data->pmsr_request_wdev, data->pmsr_request, GFP_KERNEL); err = 0; out: data->pmsr_request = NULL; data->pmsr_request_wdev = NULL; mutex_unlock(&data->mutex); return err; } static enum hrtimer_restart mac80211_hwsim_nan_dw_start(struct hrtimer *timer) { struct mac80211_hwsim_data *data = container_of(timer, struct mac80211_hwsim_data, nan_timer); struct ieee80211_hw *hw = data->hw; u64 orig_tsf = mac80211_hwsim_get_tsf(hw, NULL), tsf = orig_tsf; u32 dw_int = 512 * 1024; u64 until_dw; if (!data->nan_device_vif) return HRTIMER_NORESTART; if (data->nan_bands & BIT(NL80211_BAND_5GHZ)) { if (data->nan_curr_dw_band == NL80211_BAND_2GHZ) { dw_int = 128 * 1024; data->nan_curr_dw_band = NL80211_BAND_5GHZ; } else if (data->nan_curr_dw_band == NL80211_BAND_5GHZ) { data->nan_curr_dw_band = NL80211_BAND_2GHZ; } } until_dw = dw_int - do_div(tsf, dw_int); /* The timer might fire just before the actual DW, in which case * update the timeout to the actual next DW */ if (until_dw < dw_int / 2) until_dw += dw_int; /* The above do_div() call directly modifies the 'tsf' variable, thus, * use a copy so that the print below would show the original TSF. */ wiphy_debug(hw->wiphy, "%s: tsf=%llx, curr_dw_band=%u, next_dw=%llu\n", __func__, orig_tsf, data->nan_curr_dw_band, until_dw); hrtimer_forward_now(&data->nan_timer, ns_to_ktime(until_dw * NSEC_PER_USEC)); if (data->notify_dw) { struct ieee80211_channel *ch; struct wireless_dev *wdev = ieee80211_vif_to_wdev(data->nan_device_vif); if (data->nan_curr_dw_band == NL80211_BAND_5GHZ) ch = ieee80211_get_channel(hw->wiphy, 5745); else ch = ieee80211_get_channel(hw->wiphy, 2437); cfg80211_next_nan_dw_notif(wdev, ch, GFP_ATOMIC); } return HRTIMER_RESTART; } static int mac80211_hwsim_start_nan(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf) { struct mac80211_hwsim_data *data = hw->priv; u64 tsf = mac80211_hwsim_get_tsf(hw, NULL); u32 dw_int = 512 * 1000; u64 until_dw = dw_int - do_div(tsf, dw_int); struct wireless_dev *wdev = ieee80211_vif_to_wdev(vif); if (vif->type != NL80211_IFTYPE_NAN) return -EINVAL; if (data->nan_device_vif) return -EALREADY; /* set this before starting the timer, as preemption might occur */ data->nan_device_vif = vif; data->nan_bands = conf->bands; data->nan_curr_dw_band = NL80211_BAND_2GHZ; wiphy_debug(hw->wiphy, "nan_started, next_dw=%llu\n", until_dw); hrtimer_start(&data->nan_timer, ns_to_ktime(until_dw * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); if (conf->cluster_id && !is_zero_ether_addr(conf->cluster_id) && is_zero_ether_addr(hwsim_nan_cluster_id)) { memcpy(hwsim_nan_cluster_id, conf->cluster_id, ETH_ALEN); } else if (is_zero_ether_addr(hwsim_nan_cluster_id)) { hwsim_nan_cluster_id[0] = 0x50; hwsim_nan_cluster_id[1] = 0x6f; hwsim_nan_cluster_id[2] = 0x9a; hwsim_nan_cluster_id[3] = 0x01; hwsim_nan_cluster_id[4] = get_random_u8(); hwsim_nan_cluster_id[5] = get_random_u8(); } data->notify_dw = conf->enable_dw_notification; cfg80211_nan_cluster_joined(wdev, hwsim_nan_cluster_id, true, GFP_KERNEL); return 0; } static int mac80211_hwsim_stop_nan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *data = hw->priv; struct mac80211_hwsim_data *data2; bool nan_cluster_running = false; if (vif->type != NL80211_IFTYPE_NAN || !data->nan_device_vif || data->nan_device_vif != vif) return -EINVAL; hrtimer_cancel(&data->nan_timer); data->nan_device_vif = NULL; spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data2, &hwsim_radios, list) { if (data2->nan_device_vif) { nan_cluster_running = true; break; } } spin_unlock_bh(&hwsim_radio_lock); if (!nan_cluster_running) memset(hwsim_nan_cluster_id, 0, ETH_ALEN); return 0; } static int mac80211_hwsim_change_nan_config(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf, u32 changes) { struct mac80211_hwsim_data *data = hw->priv; if (vif->type != NL80211_IFTYPE_NAN) return -EINVAL; if (!data->nan_device_vif) return -EINVAL; wiphy_debug(hw->wiphy, "nan_config_changed: changes=0x%x\n", changes); /* Handle only the changes we care about for simulation purposes */ if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) { data->nan_bands = conf->bands; data->nan_curr_dw_band = NL80211_BAND_2GHZ; } if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) data->notify_dw = conf->enable_dw_notification; return 0; } #ifdef CONFIG_MAC80211_DEBUGFS #define HWSIM_DEBUGFS_OPS \ .link_add_debugfs = mac80211_hwsim_link_add_debugfs, #else #define HWSIM_DEBUGFS_OPS #endif #define HWSIM_COMMON_OPS \ .tx = mac80211_hwsim_tx, \ .wake_tx_queue = ieee80211_handle_wake_tx_queue, \ .start = mac80211_hwsim_start, \ .stop = mac80211_hwsim_stop, \ .add_interface = mac80211_hwsim_add_interface, \ .change_interface = mac80211_hwsim_change_interface, \ .remove_interface = mac80211_hwsim_remove_interface, \ .config = mac80211_hwsim_config, \ .configure_filter = mac80211_hwsim_configure_filter, \ .vif_cfg_changed = mac80211_hwsim_vif_info_changed, \ .link_info_changed = mac80211_hwsim_link_info_changed, \ .tx_last_beacon = mac80211_hwsim_tx_last_beacon, \ .sta_notify = mac80211_hwsim_sta_notify, \ .link_sta_rc_update = mac80211_hwsim_sta_rc_update, \ .conf_tx = mac80211_hwsim_conf_tx, \ .get_survey = mac80211_hwsim_get_survey, \ CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd) \ .ampdu_action = mac80211_hwsim_ampdu_action, \ .flush = mac80211_hwsim_flush, \ .get_et_sset_count = mac80211_hwsim_get_et_sset_count, \ .get_et_stats = mac80211_hwsim_get_et_stats, \ .get_et_strings = mac80211_hwsim_get_et_strings, \ .start_pmsr = mac80211_hwsim_start_pmsr, \ .abort_pmsr = mac80211_hwsim_abort_pmsr, \ .start_nan = mac80211_hwsim_start_nan, \ .stop_nan = mac80211_hwsim_stop_nan, \ .nan_change_conf = mac80211_hwsim_change_nan_config, \ HWSIM_DEBUGFS_OPS #define HWSIM_NON_MLO_OPS \ .sta_add = mac80211_hwsim_sta_add, \ .sta_remove = mac80211_hwsim_sta_remove, \ .set_tim = mac80211_hwsim_set_tim, \ .get_tsf = mac80211_hwsim_get_tsf, \ .set_tsf = mac80211_hwsim_set_tsf, static const struct ieee80211_ops mac80211_hwsim_ops = { HWSIM_COMMON_OPS HWSIM_NON_MLO_OPS .sw_scan_start = mac80211_hwsim_sw_scan, .sw_scan_complete = mac80211_hwsim_sw_scan_complete, .add_chanctx = ieee80211_emulate_add_chanctx, .remove_chanctx = ieee80211_emulate_remove_chanctx, .change_chanctx = ieee80211_emulate_change_chanctx, .switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx, }; #define HWSIM_CHANCTX_OPS \ .hw_scan = mac80211_hwsim_hw_scan, \ .cancel_hw_scan = mac80211_hwsim_cancel_hw_scan, \ .remain_on_channel = mac80211_hwsim_roc, \ .cancel_remain_on_channel = mac80211_hwsim_croc, \ .add_chanctx = mac80211_hwsim_add_chanctx, \ .remove_chanctx = mac80211_hwsim_remove_chanctx, \ .change_chanctx = mac80211_hwsim_change_chanctx, \ .assign_vif_chanctx = mac80211_hwsim_assign_vif_chanctx,\ .unassign_vif_chanctx = mac80211_hwsim_unassign_vif_chanctx, \ .switch_vif_chanctx = mac80211_hwsim_switch_vif_chanctx, static const struct ieee80211_ops mac80211_hwsim_mchan_ops = { HWSIM_COMMON_OPS HWSIM_NON_MLO_OPS HWSIM_CHANCTX_OPS }; static const struct ieee80211_ops mac80211_hwsim_mlo_ops = { HWSIM_COMMON_OPS HWSIM_CHANCTX_OPS .set_rts_threshold = mac80211_hwsim_set_rts_threshold, .change_vif_links = mac80211_hwsim_change_vif_links, .change_sta_links = mac80211_hwsim_change_sta_links, .sta_state = mac80211_hwsim_sta_state, .can_neg_ttlm = mac80211_hwsim_can_neg_ttlm, }; struct hwsim_new_radio_params { unsigned int channels; const char *reg_alpha2; const struct ieee80211_regdomain *regd; bool reg_strict; bool p2p_device; bool use_chanctx; bool multi_radio; bool destroy_on_close; const char *hwname; bool no_vif; const u8 *perm_addr; u32 iftypes; u32 *ciphers; u8 n_ciphers; bool mlo; const struct cfg80211_pmsr_capabilities *pmsr_capa; bool nan_device; }; static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb, struct genl_info *info) { if (info) genl_notify(&hwsim_genl_family, mcast_skb, info, HWSIM_MCGRP_CONFIG, GFP_KERNEL); else genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0, HWSIM_MCGRP_CONFIG, GFP_KERNEL); } static int append_radio_msg(struct sk_buff *skb, int id, struct hwsim_new_radio_params *param) { int ret; ret = nla_put_u32(skb, HWSIM_ATTR_RADIO_ID, id); if (ret < 0) return ret; if (param->channels) { ret = nla_put_u32(skb, HWSIM_ATTR_CHANNELS, param->channels); if (ret < 0) return ret; } if (param->reg_alpha2) { ret = nla_put(skb, HWSIM_ATTR_REG_HINT_ALPHA2, 2, param->reg_alpha2); if (ret < 0) return ret; } if (param->regd) { int i; for (i = 0; i < ARRAY_SIZE(hwsim_world_regdom_custom); i++) { if (hwsim_world_regdom_custom[i] != param->regd) continue; ret = nla_put_u32(skb, HWSIM_ATTR_REG_CUSTOM_REG, i); if (ret < 0) return ret; break; } } if (param->reg_strict) { ret = nla_put_flag(skb, HWSIM_ATTR_REG_STRICT_REG); if (ret < 0) return ret; } if (param->p2p_device) { ret = nla_put_flag(skb, HWSIM_ATTR_SUPPORT_P2P_DEVICE); if (ret < 0) return ret; } if (param->use_chanctx) { ret = nla_put_flag(skb, HWSIM_ATTR_USE_CHANCTX); if (ret < 0) return ret; } if (param->multi_radio) { ret = nla_put_flag(skb, HWSIM_ATTR_MULTI_RADIO); if (ret < 0) return ret; } if (param->hwname) { ret = nla_put(skb, HWSIM_ATTR_RADIO_NAME, strlen(param->hwname), param->hwname); if (ret < 0) return ret; } if (param->nan_device) { ret = nla_put_flag(skb, HWSIM_ATTR_SUPPORT_NAN_DEVICE); if (ret < 0) return ret; } return 0; } static void hwsim_mcast_new_radio(int id, struct genl_info *info, struct hwsim_new_radio_params *param) { struct sk_buff *mcast_skb; void *data; mcast_skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!mcast_skb) return; data = genlmsg_put(mcast_skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_NEW_RADIO); if (!data) goto out_err; if (append_radio_msg(mcast_skb, id, param) < 0) goto out_err; genlmsg_end(mcast_skb, data); hwsim_mcast_config_msg(mcast_skb, info); return; out_err: nlmsg_free(mcast_skb); } static const struct ieee80211_sband_iftype_data sband_capa_2ghz[] = { { .types_mask = BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * Since B0, B1, B2 and B3 are not set in * the supported channel width set field in the * HE PHY capabilities information field the * device is a 20MHz only device on 2.4GHz band. */ .only_20mhz = { .rx_tx_mcs7_max_nss = 0x88, .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * Since B0, B1, B2 and B3 are not set in * the supported channel width set field in the * HE PHY capabilities information field the * device is a 20MHz only device on 2.4GHz band. */ .only_20mhz = { .rx_tx_mcs7_max_nss = 0x88, .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xffff), .tx_mcs_160 = cpu_to_le16(0xffff), .rx_mcs_80p80 = cpu_to_le16(0xffff), .tx_mcs_80p80 = cpu_to_le16(0xffff), }, }, }, #endif }; static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = { { .types_mask = BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field include all * the following MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field include all * the following MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { /* TODO: should we support other types, e.g., IBSS?*/ .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, }, #endif }; static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = { { .types_mask = BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ | IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK | IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field and 320MHz in * 6GHz is supported include all the following * MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._320 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, { .types_mask = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_BSR | IEEE80211_HE_MAC_CAP2_MU_CASCADING | IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ | IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, .phy_cap_info[1] = IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK | IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK, .phy_cap_info[2] = IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK | IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK, .phy_cap_info[3] = IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, .phy_cap_info[4] = IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, .phy_cap_info[5] = IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, .phy_cap_info[6] = IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK | IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP, .phy_cap_info[7] = IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ, }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* * As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field and 320MHz in * 6GHz is supported include all the following * MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._320 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #ifdef CONFIG_MAC80211_MESH { /* TODO: should we support other types, e.g., IBSS?*/ .types_mask = BIT(NL80211_IFTYPE_MESH_POINT), .he_6ghz_capa = { .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START | IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP | IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN | IEEE80211_HE_6GHZ_CAP_SM_PS | IEEE80211_HE_6GHZ_CAP_RD_RESPONDER | IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS | IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS), }, .he_cap = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, .mac_cap_info[2] = IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, .phy_cap_info[2] = 0, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, .he_mcs_nss_supp = { .rx_mcs_80 = cpu_to_le16(0xfffa), .tx_mcs_80 = cpu_to_le16(0xfffa), .rx_mcs_160 = cpu_to_le16(0xfffa), .tx_mcs_160 = cpu_to_le16(0xfffa), .rx_mcs_80p80 = cpu_to_le16(0xfffa), .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, .eht_cap = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ, /* Leave all the other PHY capability bytes * unset, as DCM, beam forming, RU and PPE * threshold information are not supported */ }, /* For all MCS and bandwidth, set 8 NSS for both Tx and * Rx */ .eht_mcs_nss_supp = { /* As B1 and B2 are set in the supported * channel width set field in the HE PHY * capabilities information field and 320MHz in * 6GHz is supported include all the following * MCS/NSS. */ .bw._80 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._160 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, .bw._320 = { .rx_tx_mcs9_max_nss = 0x88, .rx_tx_mcs11_max_nss = 0x88, .rx_tx_mcs13_max_nss = 0x88, }, }, /* PPE threshold information is not supported */ }, }, #endif }; static void mac80211_hwsim_sband_capab(struct ieee80211_supported_band *sband) { switch (sband->band) { case NL80211_BAND_2GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_2ghz); break; case NL80211_BAND_5GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_5ghz); break; case NL80211_BAND_6GHZ: ieee80211_set_sband_iftype_data(sband, sband_capa_6ghz); break; default: break; } } #ifdef CONFIG_MAC80211_MESH #define HWSIM_MESH_BIT BIT(NL80211_IFTYPE_MESH_POINT) #else #define HWSIM_MESH_BIT 0 #endif #define HWSIM_DEFAULT_IF_LIMIT \ (BIT(NL80211_IFTYPE_STATION) | \ BIT(NL80211_IFTYPE_P2P_CLIENT) | \ BIT(NL80211_IFTYPE_AP) | \ BIT(NL80211_IFTYPE_P2P_GO) | \ HWSIM_MESH_BIT) #define HWSIM_IFTYPE_SUPPORT_MASK \ (BIT(NL80211_IFTYPE_STATION) | \ BIT(NL80211_IFTYPE_AP) | \ BIT(NL80211_IFTYPE_P2P_CLIENT) | \ BIT(NL80211_IFTYPE_P2P_GO) | \ BIT(NL80211_IFTYPE_ADHOC) | \ BIT(NL80211_IFTYPE_MESH_POINT) | \ BIT(NL80211_IFTYPE_OCB)) static const u8 iftypes_ext_capa_ap[] = { [0] = WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING, [2] = WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT, [7] = WLAN_EXT_CAPA8_OPMODE_NOTIF | WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB, [8] = WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB, [9] = WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT, }; #define MAC80211_HWSIM_MLD_CAPA_OPS \ FIELD_PREP_CONST(IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP, \ IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME) | \ FIELD_PREP_CONST(IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS, \ IEEE80211_MLD_MAX_NUM_LINKS - 1) static const struct wiphy_iftype_ext_capab mac80211_hwsim_iftypes_ext_capa[] = { { .iftype = NL80211_IFTYPE_AP, .extended_capabilities = iftypes_ext_capa_ap, .extended_capabilities_mask = iftypes_ext_capa_ap, .extended_capabilities_len = sizeof(iftypes_ext_capa_ap), .eml_capabilities = IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT, .mld_capa_and_ops = MAC80211_HWSIM_MLD_CAPA_OPS, }, }; static int mac80211_hwsim_new_radio(struct genl_info *info, struct hwsim_new_radio_params *param) { int err; u8 addr[ETH_ALEN]; struct mac80211_hwsim_data *data; struct ieee80211_hw *hw; enum nl80211_band band; const struct ieee80211_ops *ops = &mac80211_hwsim_ops; struct net *net; int idx, i; int n_limits = 0; int n_bands = 0; if (WARN_ON(param->channels > 1 && !param->use_chanctx)) return -EINVAL; spin_lock_bh(&hwsim_radio_lock); idx = hwsim_radio_idx++; spin_unlock_bh(&hwsim_radio_lock); if (param->mlo) ops = &mac80211_hwsim_mlo_ops; else if (param->use_chanctx) ops = &mac80211_hwsim_mchan_ops; hw = ieee80211_alloc_hw_nm(sizeof(*data), ops, param->hwname); if (!hw) { pr_debug("mac80211_hwsim: ieee80211_alloc_hw failed\n"); err = -ENOMEM; goto failed; } /* ieee80211_alloc_hw_nm may have used a default name */ param->hwname = wiphy_name(hw->wiphy); if (info) net = genl_info_net(info); else net = &init_net; wiphy_net_set(hw->wiphy, net); data = hw->priv; data->hw = hw; data->dev = device_create(hwsim_class, NULL, 0, hw, "hwsim%d", idx); if (IS_ERR(data->dev)) { printk(KERN_DEBUG "mac80211_hwsim: device_create failed (%ld)\n", PTR_ERR(data->dev)); err = -ENOMEM; goto failed_drvdata; } data->dev->driver = &mac80211_hwsim_driver.driver; err = device_bind_driver(data->dev); if (err != 0) { pr_debug("mac80211_hwsim: device_bind_driver failed (%d)\n", err); goto failed_bind; } skb_queue_head_init(&data->pending); SET_IEEE80211_DEV(hw, data->dev); if (!param->perm_addr) { eth_zero_addr(addr); addr[0] = 0x02; addr[3] = idx >> 8; addr[4] = idx; memcpy(data->addresses[0].addr, addr, ETH_ALEN); /* Why need here second address ? */ memcpy(data->addresses[1].addr, addr, ETH_ALEN); data->addresses[1].addr[0] |= 0x40; memcpy(data->addresses[2].addr, addr, ETH_ALEN); data->addresses[2].addr[0] |= 0x50; hw->wiphy->n_addresses = 3; hw->wiphy->addresses = data->addresses; /* possible address clash is checked at hash table insertion */ } else { memcpy(data->addresses[0].addr, param->perm_addr, ETH_ALEN); /* compatibility with automatically generated mac addr */ memcpy(data->addresses[1].addr, param->perm_addr, ETH_ALEN); memcpy(data->addresses[2].addr, param->perm_addr, ETH_ALEN); hw->wiphy->n_addresses = 3; hw->wiphy->addresses = data->addresses; } data->channels = param->channels; data->use_chanctx = param->use_chanctx; data->idx = idx; data->destroy_on_close = param->destroy_on_close; if (info) data->portid = info->snd_portid; /* setup interface limits, only on interface types we support */ if (param->iftypes & BIT(NL80211_IFTYPE_ADHOC)) { data->if_limits[n_limits].max = 1; data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_ADHOC); n_limits++; } if (param->iftypes & HWSIM_DEFAULT_IF_LIMIT) { data->if_limits[n_limits].max = 2048; /* * For this case, we may only support a subset of * HWSIM_DEFAULT_IF_LIMIT, therefore we only want to add the * bits that both param->iftype & HWSIM_DEFAULT_IF_LIMIT have. */ data->if_limits[n_limits].types = HWSIM_DEFAULT_IF_LIMIT & param->iftypes; n_limits++; } if (param->iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) { data->if_limits[n_limits].max = 1; data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_P2P_DEVICE); n_limits++; } if (param->iftypes & BIT(NL80211_IFTYPE_NAN)) { data->if_limits[n_limits].max = 1; data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_NAN); n_limits++; hw->wiphy->nan_supported_bands = BIT(NL80211_BAND_2GHZ) | BIT(NL80211_BAND_5GHZ); hw->wiphy->nan_capa.flags = WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC | WIPHY_NAN_FLAGS_USERSPACE_DE; hw->wiphy->nan_capa.op_mode = NAN_OP_MODE_PHY_MODE_MASK | NAN_OP_MODE_80P80MHZ | NAN_OP_MODE_160MHZ; hw->wiphy->nan_capa.n_antennas = 0x22; hw->wiphy->nan_capa.max_channel_switch_time = 0; hw->wiphy->nan_capa.dev_capabilities = NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED | NAN_DEV_CAPA_NDPE_SUPPORTED; hrtimer_setup(&data->nan_timer, mac80211_hwsim_nan_dw_start, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_SOFT); } data->if_combination.radar_detect_widths = BIT(NL80211_CHAN_WIDTH_5) | BIT(NL80211_CHAN_WIDTH_10) | BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_160); if (data->use_chanctx) { hw->wiphy->max_scan_ssids = 255; hw->wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN; hw->wiphy->max_remain_on_channel_duration = 1000; data->if_combination.num_different_channels = data->channels; } else { data->if_combination.num_different_channels = 1; } if (!n_limits) { err = -EINVAL; goto failed_hw; } data->if_combination.max_interfaces = 0; for (i = 0; i < n_limits; i++) data->if_combination.max_interfaces += data->if_limits[i].max; data->if_combination.n_limits = n_limits; data->if_combination.limits = data->if_limits; /* * If we actually were asked to support combinations, * advertise them - if there's only a single thing like * only IBSS then don't advertise it as combinations. */ if (data->if_combination.max_interfaces > 1) { hw->wiphy->iface_combinations = &data->if_combination; hw->wiphy->n_iface_combinations = 1; } if (param->ciphers) { memcpy(data->ciphers, param->ciphers, param->n_ciphers * sizeof(u32)); hw->wiphy->cipher_suites = data->ciphers; hw->wiphy->n_cipher_suites = param->n_ciphers; } hw->wiphy->mbssid_max_interfaces = 8; hw->wiphy->ema_max_profile_periodicity = 3; data->rx_rssi = DEFAULT_RX_RSSI; INIT_DELAYED_WORK(&data->roc_start, hw_roc_start); INIT_DELAYED_WORK(&data->roc_done, hw_roc_done); INIT_DELAYED_WORK(&data->hw_scan, hw_scan_work); hw->queues = 5; hw->offchannel_tx_hw_queue = 4; ieee80211_hw_set(hw, SUPPORT_FAST_XMIT); ieee80211_hw_set(hw, CHANCTX_STA_CSA); ieee80211_hw_set(hw, SUPPORTS_HT_CCK_RATES); ieee80211_hw_set(hw, QUEUE_CONTROL); ieee80211_hw_set(hw, WANT_MONITOR_VIF); ieee80211_hw_set(hw, AMPDU_AGGREGATION); ieee80211_hw_set(hw, MFP_CAPABLE); ieee80211_hw_set(hw, SIGNAL_DBM); ieee80211_hw_set(hw, SUPPORTS_PS); ieee80211_hw_set(hw, REPORTS_TX_ACK_STATUS); ieee80211_hw_set(hw, TDLS_WIDER_BW); ieee80211_hw_set(hw, SUPPORTS_MULTI_BSSID); ieee80211_hw_set(hw, STRICT); if (param->mlo) { hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_MLO; ieee80211_hw_set(hw, HAS_RATE_CONTROL); ieee80211_hw_set(hw, SUPPORTS_DYNAMIC_PS); ieee80211_hw_set(hw, CONNECTION_MONITOR); ieee80211_hw_set(hw, AP_LINK_PS); hw->wiphy->iftype_ext_capab = mac80211_hwsim_iftypes_ext_capa; hw->wiphy->num_iftype_ext_capab = ARRAY_SIZE(mac80211_hwsim_iftypes_ext_capa); } else { ieee80211_hw_set(hw, HOST_BROADCAST_PS_BUFFERING); ieee80211_hw_set(hw, PS_NULLFUNC_STACK); if (rctbl) ieee80211_hw_set(hw, SUPPORTS_RC_TABLE); } hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT; hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_TDLS | WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_SUPPORTS_5_10_MHZ | WIPHY_FLAG_HAS_CHANNEL_SWITCH; hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | NL80211_FEATURE_STATIC_SMPS | NL80211_FEATURE_DYNAMIC_SMPS | NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR | NL80211_FEATURE_AP_SCAN; wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BSS_COLOR); hw->wiphy->interface_modes = param->iftypes; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); hw->sta_data_size = sizeof(struct hwsim_sta_priv); hw->chanctx_data_size = sizeof(struct hwsim_chanctx_priv); memcpy(data->channels_2ghz, hwsim_channels_2ghz, sizeof(hwsim_channels_2ghz)); memcpy(data->channels_5ghz, hwsim_channels_5ghz, sizeof(hwsim_channels_5ghz)); memcpy(data->channels_6ghz, hwsim_channels_6ghz, sizeof(hwsim_channels_6ghz)); memcpy(data->channels_s1g, hwsim_channels_s1g, sizeof(hwsim_channels_s1g)); memcpy(data->rates, hwsim_rates, sizeof(hwsim_rates)); for (band = NL80211_BAND_2GHZ; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband = &data->bands[band]; struct wiphy_radio_freq_range *radio_range; const struct ieee80211_channel *c; struct wiphy_radio *radio; sband->band = band; switch (band) { case NL80211_BAND_2GHZ: sband->channels = data->channels_2ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_2ghz); sband->bitrates = data->rates; sband->n_bitrates = ARRAY_SIZE(hwsim_rates); break; case NL80211_BAND_5GHZ: sband->channels = data->channels_5ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_5ghz); sband->bitrates = data->rates + 4; sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4; sband->vht_cap.vht_supported = true; sband->vht_cap.cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; sband->vht_cap.vht_mcs.rx_mcs_map = cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | IEEE80211_VHT_MCS_SUPPORT_0_9 << 14); sband->vht_cap.vht_mcs.tx_mcs_map = sband->vht_cap.vht_mcs.rx_mcs_map; break; case NL80211_BAND_6GHZ: sband->channels = data->channels_6ghz; sband->n_channels = ARRAY_SIZE(hwsim_channels_6ghz); sband->bitrates = data->rates + 4; sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4; break; case NL80211_BAND_S1GHZ: memcpy(&sband->s1g_cap, &hwsim_s1g_cap, sizeof(sband->s1g_cap)); sband->channels = data->channels_s1g; sband->n_channels = ARRAY_SIZE(hwsim_channels_s1g); break; default: continue; } if (band != NL80211_BAND_6GHZ){ sband->ht_cap.ht_supported = true; sband->ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40; sband->ht_cap.ampdu_factor = 0x3; sband->ht_cap.ampdu_density = 0x6; memset(&sband->ht_cap.mcs, 0, sizeof(sband->ht_cap.mcs)); sband->ht_cap.mcs.rx_mask[0] = 0xff; sband->ht_cap.mcs.rx_mask[1] = 0xff; sband->ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED; } mac80211_hwsim_sband_capab(sband); hw->wiphy->bands[band] = sband; if (!param->multi_radio) continue; c = sband->channels; radio_range = &data->radio_range[n_bands]; radio_range->start_freq = ieee80211_channel_to_khz(c) - 10000; c += sband->n_channels - 1; radio_range->end_freq = ieee80211_channel_to_khz(c) + 10000; radio = &data->radio[n_bands++]; radio->freq_range = radio_range; radio->n_freq_range = 1; radio->iface_combinations = &data->if_combination_radio; radio->n_iface_combinations = 1; } if (param->multi_radio) { hw->wiphy->radio = data->radio; hw->wiphy->n_radio = n_bands; memcpy(&data->if_combination_radio, &data->if_combination, sizeof(data->if_combination)); data->if_combination.num_different_channels *= n_bands; } if (data->use_chanctx) data->if_combination.radar_detect_widths = 0; /* By default all radios belong to the first group */ data->group = 1; mutex_init(&data->mutex); data->netgroup = hwsim_net_get_netgroup(net); data->wmediumd = hwsim_net_get_wmediumd(net); /* Enable frame retransmissions for lossy channels */ hw->max_rates = 4; hw->max_rate_tries = 11; hw->wiphy->vendor_commands = mac80211_hwsim_vendor_commands; hw->wiphy->n_vendor_commands = ARRAY_SIZE(mac80211_hwsim_vendor_commands); hw->wiphy->vendor_events = mac80211_hwsim_vendor_events; hw->wiphy->n_vendor_events = ARRAY_SIZE(mac80211_hwsim_vendor_events); if (param->reg_strict) hw->wiphy->regulatory_flags |= REGULATORY_STRICT_REG; if (param->regd) { data->regd = param->regd; hw->wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; wiphy_apply_custom_regulatory(hw->wiphy, param->regd); /* give the regulatory workqueue a chance to run */ schedule_timeout_interruptible(1); } wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_DFS_CONCURRENT); if (param->no_vif) ieee80211_hw_set(hw, NO_AUTO_VIF); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_PUNCT); for (i = 0; i < ARRAY_SIZE(data->link_data); i++) { hrtimer_setup(&data->link_data[i].beacon_timer, mac80211_hwsim_beacon, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_SOFT); data->link_data[i].link_id = i; } err = ieee80211_register_hw(hw); if (err < 0) { pr_debug("mac80211_hwsim: ieee80211_register_hw failed (%d)\n", err); goto failed_hw; } wiphy_dbg(hw->wiphy, "hwaddr %pM registered\n", hw->wiphy->perm_addr); if (param->reg_alpha2) { data->alpha2[0] = param->reg_alpha2[0]; data->alpha2[1] = param->reg_alpha2[1]; regulatory_hint(hw->wiphy, param->reg_alpha2); } data->debugfs = debugfs_create_dir("hwsim", hw->wiphy->debugfsdir); debugfs_create_file("ps", 0666, data->debugfs, data, &hwsim_fops_ps); debugfs_create_file("group", 0666, data->debugfs, data, &hwsim_fops_group); debugfs_create_file("rx_rssi", 0666, data->debugfs, data, &hwsim_fops_rx_rssi); if (!data->use_chanctx) debugfs_create_file("dfs_simulate_radar", 0222, data->debugfs, data, &hwsim_simulate_radar); if (param->pmsr_capa) { data->pmsr_capa = *param->pmsr_capa; hw->wiphy->pmsr_capa = &data->pmsr_capa; } spin_lock_bh(&hwsim_radio_lock); err = rhashtable_insert_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); if (err < 0) { if (info) { GENL_SET_ERR_MSG(info, "perm addr already present"); NL_SET_BAD_ATTR(info->extack, info->attrs[HWSIM_ATTR_PERM_ADDR]); } spin_unlock_bh(&hwsim_radio_lock); goto failed_final_insert; } list_add_tail(&data->list, &hwsim_radios); hwsim_radios_generation++; spin_unlock_bh(&hwsim_radio_lock); hwsim_mcast_new_radio(idx, info, param); return idx; failed_final_insert: debugfs_remove_recursive(data->debugfs); ieee80211_unregister_hw(data->hw); failed_hw: device_release_driver(data->dev); failed_bind: device_unregister(data->dev); failed_drvdata: ieee80211_free_hw(hw); failed: return err; } static void hwsim_mcast_del_radio(int id, const char *hwname, struct genl_info *info) { struct sk_buff *skb; void *data; int ret; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; data = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_DEL_RADIO); if (!data) goto error; ret = nla_put_u32(skb, HWSIM_ATTR_RADIO_ID, id); if (ret < 0) goto error; ret = nla_put(skb, HWSIM_ATTR_RADIO_NAME, strlen(hwname), hwname); if (ret < 0) goto error; genlmsg_end(skb, data); hwsim_mcast_config_msg(skb, info); return; error: nlmsg_free(skb); } static void mac80211_hwsim_del_radio(struct mac80211_hwsim_data *data, const char *hwname, struct genl_info *info) { hwsim_mcast_del_radio(data->idx, hwname, info); debugfs_remove_recursive(data->debugfs); ieee80211_unregister_hw(data->hw); device_release_driver(data->dev); device_unregister(data->dev); ieee80211_free_hw(data->hw); } static int mac80211_hwsim_get_radio(struct sk_buff *skb, struct mac80211_hwsim_data *data, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct hwsim_new_radio_params param = { }; int res = -EMSGSIZE; hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags, HWSIM_CMD_GET_RADIO); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (data->alpha2[0] && data->alpha2[1]) param.reg_alpha2 = data->alpha2; param.reg_strict = !!(data->hw->wiphy->regulatory_flags & REGULATORY_STRICT_REG); param.p2p_device = !!(data->hw->wiphy->interface_modes & BIT(NL80211_IFTYPE_P2P_DEVICE)); param.nan_device = !!(data->hw->wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)); param.use_chanctx = data->use_chanctx; param.regd = data->regd; param.channels = data->channels; param.hwname = wiphy_name(data->hw->wiphy); param.pmsr_capa = &data->pmsr_capa; res = append_radio_msg(skb, data->idx, ¶m); if (res < 0) goto out_err; genlmsg_end(skb, hdr); return 0; out_err: genlmsg_cancel(skb, hdr); return res; } static void mac80211_hwsim_free(void) { struct mac80211_hwsim_data *data; spin_lock_bh(&hwsim_radio_lock); while ((data = list_first_entry_or_null(&hwsim_radios, struct mac80211_hwsim_data, list))) { list_del(&data->list); spin_unlock_bh(&hwsim_radio_lock); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); spin_lock_bh(&hwsim_radio_lock); } spin_unlock_bh(&hwsim_radio_lock); class_destroy(hwsim_class); } static const struct net_device_ops hwsim_netdev_ops = { .ndo_start_xmit = hwsim_mon_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; static void hwsim_mon_setup(struct net_device *dev) { u8 addr[ETH_ALEN]; dev->netdev_ops = &hwsim_netdev_ops; dev->needs_free_netdev = true; ether_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->type = ARPHRD_IEEE80211_RADIOTAP; eth_zero_addr(addr); addr[0] = 0x12; eth_hw_addr_set(dev, addr); } static void hwsim_register_wmediumd(struct net *net, u32 portid) { struct mac80211_hwsim_data *data; hwsim_net_set_wmediumd(net, portid); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (data->netgroup == hwsim_net_get_netgroup(net)) data->wmediumd = portid; } spin_unlock_bh(&hwsim_radio_lock); } static int hwsim_tx_info_frame_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct ieee80211_hdr *hdr; struct mac80211_hwsim_data *data2; struct ieee80211_tx_info *txi; struct hwsim_tx_rate *tx_attempts; u64 ret_skb_cookie; struct sk_buff *skb, *tmp; const u8 *src; unsigned int hwsim_flags; int i; unsigned long flags; bool found = false; if (!info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER] || !info->attrs[HWSIM_ATTR_FLAGS] || !info->attrs[HWSIM_ATTR_COOKIE] || !info->attrs[HWSIM_ATTR_SIGNAL] || !info->attrs[HWSIM_ATTR_TX_INFO]) goto out; src = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]); hwsim_flags = nla_get_u32(info->attrs[HWSIM_ATTR_FLAGS]); ret_skb_cookie = nla_get_u64(info->attrs[HWSIM_ATTR_COOKIE]); data2 = get_hwsim_data_ref_from_addr(src); if (!data2) goto out; if (!hwsim_virtio_enabled) { if (hwsim_net_get_netgroup(genl_info_net(info)) != data2->netgroup) goto out; if (info->snd_portid != data2->wmediumd) goto out; } /* look for the skb matching the cookie passed back from user */ spin_lock_irqsave(&data2->pending.lock, flags); skb_queue_walk_safe(&data2->pending, skb, tmp) { uintptr_t skb_cookie; txi = IEEE80211_SKB_CB(skb); skb_cookie = (uintptr_t)txi->rate_driver_data[0]; if (skb_cookie == ret_skb_cookie) { __skb_unlink(skb, &data2->pending); found = true; break; } } spin_unlock_irqrestore(&data2->pending.lock, flags); /* not found */ if (!found) goto out; /* Tx info received because the frame was broadcasted on user space, so we get all the necessary info: tx attempts and skb control buff */ tx_attempts = (struct hwsim_tx_rate *)nla_data( info->attrs[HWSIM_ATTR_TX_INFO]); /* now send back TX status */ txi = IEEE80211_SKB_CB(skb); ieee80211_tx_info_clear_status(txi); for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { txi->status.rates[i].idx = tx_attempts[i].idx; txi->status.rates[i].count = tx_attempts[i].count; } txi->status.ack_signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]); if (!(hwsim_flags & HWSIM_TX_CTL_NO_ACK) && (hwsim_flags & HWSIM_TX_STAT_ACK)) { if (skb->len >= 16) { hdr = (struct ieee80211_hdr *) skb->data; mac80211_hwsim_monitor_ack(data2->channel, hdr->addr2); } txi->flags |= IEEE80211_TX_STAT_ACK; } if (hwsim_flags & HWSIM_TX_CTL_NO_ACK) txi->flags |= IEEE80211_TX_STAT_NOACK_TRANSMITTED; ieee80211_tx_status_irqsafe(data2->hw, skb); return 0; out: return -EINVAL; } static int hwsim_cloned_frame_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct mac80211_hwsim_data *data2; struct ieee80211_rx_status rx_status; struct ieee80211_hdr *hdr; const u8 *dst; int frame_data_len; void *frame_data; struct sk_buff *skb = NULL; struct ieee80211_channel *channel = NULL; if (!info->attrs[HWSIM_ATTR_ADDR_RECEIVER] || !info->attrs[HWSIM_ATTR_FRAME] || !info->attrs[HWSIM_ATTR_RX_RATE] || !info->attrs[HWSIM_ATTR_SIGNAL]) goto out; dst = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_RECEIVER]); frame_data_len = nla_len(info->attrs[HWSIM_ATTR_FRAME]); frame_data = (void *)nla_data(info->attrs[HWSIM_ATTR_FRAME]); if (frame_data_len < sizeof(struct ieee80211_hdr_3addr) || frame_data_len > IEEE80211_MAX_DATA_LEN) goto err; /* Allocate new skb here */ skb = alloc_skb(frame_data_len, GFP_KERNEL); if (skb == NULL) goto err; /* Copy the data */ skb_put_data(skb, frame_data, frame_data_len); data2 = get_hwsim_data_ref_from_addr(dst); if (!data2) goto out; if (data2->use_chanctx) { if (data2->tmp_chan) channel = data2->tmp_chan; } else { channel = data2->channel; } if (!hwsim_virtio_enabled) { if (hwsim_net_get_netgroup(genl_info_net(info)) != data2->netgroup) goto out; if (info->snd_portid != data2->wmediumd) goto out; } /* check if radio is configured properly */ if ((data2->idle && !data2->tmp_chan) || !data2->started) goto out; /* A frame is received from user space */ memset(&rx_status, 0, sizeof(rx_status)); if (info->attrs[HWSIM_ATTR_FREQ]) { struct tx_iter_data iter_data = {}; /* throw away off-channel packets, but allow both the temporary * ("hw" scan/remain-on-channel), regular channels and links, * since the internal datapath also allows this */ rx_status.freq = nla_get_u32(info->attrs[HWSIM_ATTR_FREQ]); iter_data.channel = ieee80211_get_channel(data2->hw->wiphy, rx_status.freq); if (!iter_data.channel) goto out; rx_status.band = iter_data.channel->band; mutex_lock(&data2->mutex); if (!hwsim_chans_compat(iter_data.channel, channel)) { ieee80211_iterate_active_interfaces_atomic( data2->hw, IEEE80211_IFACE_ITER_NORMAL, mac80211_hwsim_tx_iter, &iter_data); if (!iter_data.receive) { mutex_unlock(&data2->mutex); goto out; } } mutex_unlock(&data2->mutex); } else if (!channel) { goto out; } else { rx_status.freq = channel->center_freq; rx_status.band = channel->band; } rx_status.rate_idx = nla_get_u32(info->attrs[HWSIM_ATTR_RX_RATE]); if (rx_status.rate_idx >= data2->hw->wiphy->bands[rx_status.band]->n_bitrates) goto out; rx_status.signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]); hdr = (void *)skb->data; if (ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control)) rx_status.boottime_ns = ktime_get_boottime_ns(); mac80211_hwsim_rx(data2, &rx_status, skb); return 0; err: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); out: dev_kfree_skb(skb); return -EINVAL; } static int hwsim_register_received_nl(struct sk_buff *skb_2, struct genl_info *info) { struct net *net = genl_info_net(info); struct mac80211_hwsim_data *data; int chans = 1; spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) chans = max(chans, data->channels); spin_unlock_bh(&hwsim_radio_lock); /* In the future we should revise the userspace API and allow it * to set a flag that it does support multi-channel, then we can * let this pass conditionally on the flag. * For current userspace, prohibit it since it won't work right. */ if (chans > 1) return -EOPNOTSUPP; if (hwsim_net_get_wmediumd(net)) return -EBUSY; hwsim_register_wmediumd(net, info->snd_portid); pr_debug("mac80211_hwsim: received a REGISTER, " "switching to wmediumd mode with pid %d\n", info->snd_portid); return 0; } /* ensures ciphers only include ciphers listed in 'hwsim_ciphers' array */ static bool hwsim_known_ciphers(const u32 *ciphers, int n_ciphers) { int i; for (i = 0; i < n_ciphers; i++) { int j; int found = 0; for (j = 0; j < ARRAY_SIZE(hwsim_ciphers); j++) { if (ciphers[i] == hwsim_ciphers[j]) { found = 1; break; } } if (!found) return false; } return true; } static int parse_ftm_capa(const struct nlattr *ftm_capa, struct cfg80211_pmsr_capabilities *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1]; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_FTM_CAPA_ATTR_MAX, ftm_capa, hwsim_ftm_capa_policy, NULL); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, ftm_capa, "malformed FTM capability"); return -EINVAL; } out->ftm.supported = 1; if (tb[NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES]) out->ftm.preambles = nla_get_u32(tb[NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS]) out->ftm.bandwidths = nla_get_u32(tb[NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT]) out->ftm.max_bursts_exponent = nla_get_u8(tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT]); if (tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST]) out->ftm.max_ftms_per_burst = nla_get_u8(tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_ASAP]; out->ftm.non_asap = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP]; out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI]; out->ftm.request_civicloc = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC]; out->ftm.trigger_based = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED]; out->ftm.non_trigger_based = !!tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED]; return 0; } static int parse_pmsr_capa(const struct nlattr *pmsr_capa, struct cfg80211_pmsr_capabilities *out, struct genl_info *info) { struct nlattr *tb[NL80211_PMSR_ATTR_MAX + 1]; struct nlattr *nla; int size; int ret; ret = nla_parse_nested(tb, NL80211_PMSR_ATTR_MAX, pmsr_capa, hwsim_pmsr_capa_policy, NULL); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, pmsr_capa, "malformed PMSR capability"); return -EINVAL; } if (tb[NL80211_PMSR_ATTR_MAX_PEERS]) out->max_peers = nla_get_u32(tb[NL80211_PMSR_ATTR_MAX_PEERS]); out->report_ap_tsf = !!tb[NL80211_PMSR_ATTR_REPORT_AP_TSF]; out->randomize_mac_addr = !!tb[NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR]; if (!tb[NL80211_PMSR_ATTR_TYPE_CAPA]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NL80211_PMSR_ATTR_TYPE_CAPA], "malformed PMSR type"); return -EINVAL; } nla_for_each_nested(nla, tb[NL80211_PMSR_ATTR_TYPE_CAPA], size) { switch (nla_type(nla)) { case NL80211_PMSR_TYPE_FTM: parse_ftm_capa(nla, out, info); break; default: NL_SET_ERR_MSG_ATTR(info->extack, nla, "unsupported measurement type"); return -EINVAL; } } return 0; } static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_new_radio_params param = { 0 }; const char *hwname = NULL; int ret; param.reg_strict = info->attrs[HWSIM_ATTR_REG_STRICT_REG]; param.p2p_device = info->attrs[HWSIM_ATTR_SUPPORT_P2P_DEVICE]; param.nan_device = info->attrs[HWSIM_ATTR_SUPPORT_NAN_DEVICE]; param.channels = channels; param.destroy_on_close = info->attrs[HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE]; if (info->attrs[HWSIM_ATTR_CHANNELS]) param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]); if (param.channels < 1) { GENL_SET_ERR_MSG(info, "must have at least one channel"); return -EINVAL; } if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; if (info->attrs[HWSIM_ATTR_USE_CHANCTX]) param.use_chanctx = true; else param.use_chanctx = (param.channels > 1); if (info->attrs[HWSIM_ATTR_MULTI_RADIO]) param.multi_radio = true; if (info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2]) param.reg_alpha2 = nla_data(info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2]); if (info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]) { u32 idx = nla_get_u32(info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]); if (idx >= ARRAY_SIZE(hwsim_world_regdom_custom)) return -EINVAL; idx = array_index_nospec(idx, ARRAY_SIZE(hwsim_world_regdom_custom)); param.regd = hwsim_world_regdom_custom[idx]; } if (info->attrs[HWSIM_ATTR_PERM_ADDR]) { if (!is_valid_ether_addr( nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]))) { GENL_SET_ERR_MSG(info,"MAC is no valid source addr"); NL_SET_BAD_ATTR(info->extack, info->attrs[HWSIM_ATTR_PERM_ADDR]); return -EINVAL; } param.perm_addr = nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]); } if (info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]) { param.iftypes = nla_get_u32(info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]); if (param.iftypes & ~HWSIM_IFTYPE_SUPPORT_MASK) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT], "cannot support more iftypes than kernel"); return -EINVAL; } } else { param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK; } /* ensure both flag and iftype support is honored */ if (param.p2p_device || param.iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) { param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE); param.p2p_device = true; } /* ensure both flag and iftype support is honored */ if (param.nan_device || param.iftypes & BIT(NL80211_IFTYPE_NAN)) { param.iftypes |= BIT(NL80211_IFTYPE_NAN); param.nan_device = true; } if (info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]) { u32 len = nla_len(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]); param.ciphers = nla_data(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]); if (len % sizeof(u32)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "bad cipher list length"); return -EINVAL; } param.n_ciphers = len / sizeof(u32); if (param.n_ciphers > ARRAY_SIZE(hwsim_ciphers)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "too many ciphers specified"); return -EINVAL; } if (!hwsim_known_ciphers(param.ciphers, param.n_ciphers)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[HWSIM_ATTR_CIPHER_SUPPORT], "unsupported ciphers specified"); return -EINVAL; } } param.mlo = info->attrs[HWSIM_ATTR_MLO_SUPPORT]; if (param.mlo || param.multi_radio) param.use_chanctx = true; if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]), nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), GFP_KERNEL); if (!hwname) return -ENOMEM; param.hwname = hwname; } if (info->attrs[HWSIM_ATTR_PMSR_SUPPORT]) { struct cfg80211_pmsr_capabilities *pmsr_capa; pmsr_capa = kmalloc(sizeof(*pmsr_capa), GFP_KERNEL); if (!pmsr_capa) { ret = -ENOMEM; goto out_free; } param.pmsr_capa = pmsr_capa; ret = parse_pmsr_capa(info->attrs[HWSIM_ATTR_PMSR_SUPPORT], pmsr_capa, info); if (ret) goto out_free; } ret = mac80211_hwsim_new_radio(info, ¶m); out_free: kfree(hwname); kfree(param.pmsr_capa); return ret; } static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; s64 idx = -1; const char *hwname = NULL; if (info->attrs[HWSIM_ATTR_RADIO_ID]) { idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]); } else if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]), nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), GFP_KERNEL); if (!hwname) return -ENOMEM; } else return -EINVAL; spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (idx >= 0) { if (data->idx != idx) continue; } else { if (!hwname || strcmp(hwname, wiphy_name(data->hw->wiphy))) continue; } if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info))) continue; list_del(&data->list); rhashtable_remove_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); hwsim_radios_generation++; spin_unlock_bh(&hwsim_radio_lock); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), info); kfree(hwname); return 0; } spin_unlock_bh(&hwsim_radio_lock); kfree(hwname); return -ENODEV; } static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct mac80211_hwsim_data *data; struct sk_buff *skb; int idx, res = -ENODEV; if (!info->attrs[HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry(data, &hwsim_radios, list) { if (data->idx != idx) continue; if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info))) continue; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) { res = -ENOMEM; goto out_err; } res = mac80211_hwsim_get_radio(skb, data, info->snd_portid, info->snd_seq, NULL, 0); if (res < 0) { nlmsg_free(skb); goto out_err; } res = genlmsg_reply(skb, info); break; } out_err: spin_unlock_bh(&hwsim_radio_lock); return res; } static int hwsim_dump_radio_nl(struct sk_buff *skb, struct netlink_callback *cb) { int last_idx = cb->args[0] - 1; struct mac80211_hwsim_data *data = NULL; int res = 0; void *hdr; spin_lock_bh(&hwsim_radio_lock); cb->seq = hwsim_radios_generation; if (last_idx >= hwsim_radio_idx-1) goto done; list_for_each_entry(data, &hwsim_radios, list) { if (data->idx <= last_idx) continue; if (!net_eq(wiphy_net(data->hw->wiphy), sock_net(skb->sk))) continue; res = mac80211_hwsim_get_radio(skb, data, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (res < 0) break; last_idx = data->idx; } cb->args[0] = last_idx + 1; /* list changed, but no new element sent, set interrupted flag */ if (skb->len == 0 && cb->prev_seq && cb->seq != cb->prev_seq) { hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &hwsim_genl_family, NLM_F_MULTI, HWSIM_CMD_GET_RADIO); if (hdr) { genl_dump_check_consistent(cb, hdr); genlmsg_end(skb, hdr); } else { res = -EMSGSIZE; } } done: spin_unlock_bh(&hwsim_radio_lock); return res ?: skb->len; } /* Generic Netlink operations array */ static const struct genl_small_ops hwsim_ops[] = { { .cmd = HWSIM_CMD_REGISTER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_register_received_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_cloned_frame_received_nl, }, { .cmd = HWSIM_CMD_TX_INFO_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_tx_info_frame_received_nl, }, { .cmd = HWSIM_CMD_NEW_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_DEL_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = HWSIM_CMD_GET_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_get_radio_nl, .dumpit = hwsim_dump_radio_nl, }, { .cmd = HWSIM_CMD_REPORT_PMSR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_pmsr_report_nl, }, }; static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC80211_HWSIM", .version = 1, .maxattr = HWSIM_ATTR_MAX, .policy = hwsim_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = hwsim_ops, .n_small_ops = ARRAY_SIZE(hwsim_ops), .resv_start_op = HWSIM_CMD_REPORT_PMSR + 1, // match with __HWSIM_CMD_MAX .mcgrps = hwsim_mcgrps, .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; static void remove_user_radios(u32 portid, int netgroup) { struct mac80211_hwsim_data *entry, *tmp; LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &hwsim_radios, list) { if (entry->destroy_on_close && entry->portid == portid && entry->netgroup == netgroup) { list_move(&entry->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &entry->rht, hwsim_rht_params); hwsim_radios_generation++; } } spin_unlock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &list, list) { list_del(&entry->list); mac80211_hwsim_del_radio(entry, wiphy_name(entry->hw->wiphy), NULL); } } static int mac80211_hwsim_netlink_notify(struct notifier_block *nb, unsigned long state, void *_notify) { struct netlink_notify *notify = _notify; if (state != NETLINK_URELEASE) return NOTIFY_DONE; remove_user_radios(notify->portid, hwsim_net_get_netgroup(notify->net)); if (notify->portid == hwsim_net_get_wmediumd(notify->net)) { printk(KERN_INFO "mac80211_hwsim: wmediumd released netlink" " socket, switching to perfect channel medium\n"); hwsim_register_wmediumd(notify->net, 0); } return NOTIFY_DONE; } static struct notifier_block hwsim_netlink_notifier = { .notifier_call = mac80211_hwsim_netlink_notify, }; static int __init hwsim_init_netlink(void) { int rc; printk(KERN_INFO "mac80211_hwsim: initializing netlink\n"); rc = genl_register_family(&hwsim_genl_family); if (rc) goto failure; rc = netlink_register_notifier(&hwsim_netlink_notifier); if (rc) { genl_unregister_family(&hwsim_genl_family); goto failure; } return 0; failure: pr_debug("mac80211_hwsim: error occurred in %s\n", __func__); return -EINVAL; } static __net_init int hwsim_init_net(struct net *net) { return hwsim_net_set_netgroup(net); } static void __net_exit hwsim_exit_net(struct net *net) { struct mac80211_hwsim_data *data, *tmp; LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(data, tmp, &hwsim_radios, list) { if (!net_eq(wiphy_net(data->hw->wiphy), net)) continue; /* Radios created in init_net are returned to init_net. */ if (data->netgroup == hwsim_net_get_netgroup(&init_net)) continue; list_move(&data->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); hwsim_radios_generation++; } spin_unlock_bh(&hwsim_radio_lock); list_for_each_entry_safe(data, tmp, &list, list) { list_del(&data->list); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); } ida_free(&hwsim_netgroup_ida, hwsim_net_get_netgroup(net)); } static struct pernet_operations hwsim_net_ops = { .init = hwsim_init_net, .exit = hwsim_exit_net, .id = &hwsim_net_id, .size = sizeof(struct hwsim_net), }; static void hwsim_exit_netlink(void) { /* unregister the notifier */ netlink_unregister_notifier(&hwsim_netlink_notifier); /* unregister the family */ genl_unregister_family(&hwsim_genl_family); } #if IS_REACHABLE(CONFIG_VIRTIO) static void hwsim_virtio_tx_done(struct virtqueue *vq) { unsigned int len; struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); while ((skb = virtqueue_get_buf(vq, &len))) dev_kfree_skb_irq(skb); spin_unlock_irqrestore(&hwsim_virtio_lock, flags); } static int hwsim_virtio_handle_cmd(struct sk_buff *skb) { struct nlmsghdr *nlh; struct genlmsghdr *gnlh; struct nlattr *tb[HWSIM_ATTR_MAX + 1]; struct genl_info info = {}; int err; nlh = nlmsg_hdr(skb); gnlh = nlmsg_data(nlh); if (skb->len < nlh->nlmsg_len) return -EINVAL; err = genlmsg_parse(nlh, &hwsim_genl_family, tb, HWSIM_ATTR_MAX, hwsim_genl_policy, NULL); if (err) { pr_err_ratelimited("hwsim: genlmsg_parse returned %d\n", err); return err; } info.attrs = tb; switch (gnlh->cmd) { case HWSIM_CMD_FRAME: hwsim_cloned_frame_received_nl(skb, &info); break; case HWSIM_CMD_TX_INFO_FRAME: hwsim_tx_info_frame_received_nl(skb, &info); break; case HWSIM_CMD_REPORT_PMSR: hwsim_pmsr_report_nl(skb, &info); break; default: pr_err_ratelimited("hwsim: invalid cmd: %d\n", gnlh->cmd); return -EPROTO; } return 0; } static void hwsim_virtio_rx_work(struct work_struct *work) { struct virtqueue *vq; unsigned int len; struct sk_buff *skb; struct scatterlist sg[1]; int err; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) goto out_unlock; skb = virtqueue_get_buf(hwsim_vqs[HWSIM_VQ_RX], &len); if (!skb) goto out_unlock; spin_unlock_irqrestore(&hwsim_virtio_lock, flags); skb->data = skb->head; skb_reset_tail_pointer(skb); skb_put(skb, len); hwsim_virtio_handle_cmd(skb); spin_lock_irqsave(&hwsim_virtio_lock, flags); if (!hwsim_virtio_enabled) { dev_kfree_skb_irq(skb); goto out_unlock; } vq = hwsim_vqs[HWSIM_VQ_RX]; sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_ATOMIC); if (WARN(err, "virtqueue_add_inbuf returned %d\n", err)) dev_kfree_skb_irq(skb); else virtqueue_kick(vq); schedule_work(&hwsim_virtio_rx); out_unlock: spin_unlock_irqrestore(&hwsim_virtio_lock, flags); } static void hwsim_virtio_rx_done(struct virtqueue *vq) { schedule_work(&hwsim_virtio_rx); } static int init_vqs(struct virtio_device *vdev) { struct virtqueue_info vqs_info[HWSIM_NUM_VQS] = { [HWSIM_VQ_TX] = { "tx", hwsim_virtio_tx_done }, [HWSIM_VQ_RX] = { "rx", hwsim_virtio_rx_done }, }; return virtio_find_vqs(vdev, HWSIM_NUM_VQS, hwsim_vqs, vqs_info, NULL); } static int fill_vq(struct virtqueue *vq) { int i, err; struct sk_buff *skb; struct scatterlist sg[1]; for (i = 0; i < virtqueue_get_vring_size(vq); i++) { skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; sg_init_one(sg, skb->head, skb_end_offset(skb)); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); if (err) { nlmsg_free(skb); return err; } } virtqueue_kick(vq); return 0; } static void remove_vqs(struct virtio_device *vdev) { int i; virtio_reset_device(vdev); for (i = 0; i < ARRAY_SIZE(hwsim_vqs); i++) { struct virtqueue *vq = hwsim_vqs[i]; struct sk_buff *skb; while ((skb = virtqueue_detach_unused_buf(vq))) nlmsg_free(skb); } vdev->config->del_vqs(vdev); } static int hwsim_virtio_probe(struct virtio_device *vdev) { int err; unsigned long flags; spin_lock_irqsave(&hwsim_virtio_lock, flags); if (hwsim_virtio_enabled) { spin_unlock_irqrestore(&hwsim_virtio_lock, flags); return -EEXIST; } spin_unlock_irqrestore(&hwsim_virtio_lock, flags); err = init_vqs(vdev); if (err) return err; virtio_device_ready(vdev); err = fill_vq(hwsim_vqs[HWSIM_VQ_RX]); if (err) goto out_remove; spin_lock_irqsave(&hwsim_virtio_lock, flags); hwsim_virtio_enabled = true; spin_unlock_irqrestore(&hwsim_virtio_lock, flags); schedule_work(&hwsim_virtio_rx); return 0; out_remove: remove_vqs(vdev); return err; } static void hwsim_virtio_remove(struct virtio_device *vdev) { hwsim_virtio_enabled = false; cancel_work_sync(&hwsim_virtio_rx); remove_vqs(vdev); } /* MAC80211_HWSIM virtio device id table */ static const struct virtio_device_id id_table[] = { { VIRTIO_ID_MAC80211_HWSIM, VIRTIO_DEV_ANY_ID }, { 0 } }; MODULE_DEVICE_TABLE(virtio, id_table); static struct virtio_driver virtio_hwsim = { .driver.name = KBUILD_MODNAME, .id_table = id_table, .probe = hwsim_virtio_probe, .remove = hwsim_virtio_remove, }; static int hwsim_register_virtio_driver(void) { return register_virtio_driver(&virtio_hwsim); } static void hwsim_unregister_virtio_driver(void) { unregister_virtio_driver(&virtio_hwsim); } #else static inline int hwsim_register_virtio_driver(void) { return 0; } static inline void hwsim_unregister_virtio_driver(void) { } #endif static int __init init_mac80211_hwsim(void) { int i, err; if (radios < 0 || radios > 100) return -EINVAL; if (channels < 1) return -EINVAL; err = rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params); if (err) return err; err = register_pernet_device(&hwsim_net_ops); if (err) goto out_free_rht; err = platform_driver_register(&mac80211_hwsim_driver); if (err) goto out_unregister_pernet; err = hwsim_init_netlink(); if (err) goto out_unregister_driver; err = hwsim_register_virtio_driver(); if (err) goto out_exit_netlink; hwsim_class = class_create("mac80211_hwsim"); if (IS_ERR(hwsim_class)) { err = PTR_ERR(hwsim_class); goto out_exit_virtio; } hwsim_init_s1g_channels(hwsim_channels_s1g); for (i = 0; i < radios; i++) { struct hwsim_new_radio_params param = { 0 }; param.channels = channels; switch (regtest) { case HWSIM_REGTEST_DIFF_COUNTRY: if (i < ARRAY_SIZE(hwsim_alpha2s)) param.reg_alpha2 = hwsim_alpha2s[i]; break; case HWSIM_REGTEST_DRIVER_REG_FOLLOW: if (!i) param.reg_alpha2 = hwsim_alpha2s[0]; break; case HWSIM_REGTEST_STRICT_ALL: param.reg_strict = true; fallthrough; case HWSIM_REGTEST_DRIVER_REG_ALL: param.reg_alpha2 = hwsim_alpha2s[0]; break; case HWSIM_REGTEST_WORLD_ROAM: if (i == 0) param.regd = &hwsim_world_regdom_custom_01; break; case HWSIM_REGTEST_CUSTOM_WORLD: param.regd = &hwsim_world_regdom_custom_03; break; case HWSIM_REGTEST_CUSTOM_WORLD_2: if (i == 0) param.regd = &hwsim_world_regdom_custom_03; else if (i == 1) param.regd = &hwsim_world_regdom_custom_02; break; case HWSIM_REGTEST_STRICT_FOLLOW: if (i == 0) { param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[0]; } break; case HWSIM_REGTEST_STRICT_AND_DRIVER_REG: if (i == 0) { param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[0]; } else if (i == 1) { param.reg_alpha2 = hwsim_alpha2s[1]; } break; case HWSIM_REGTEST_ALL: switch (i) { case 0: param.regd = &hwsim_world_regdom_custom_01; break; case 1: param.regd = &hwsim_world_regdom_custom_02; break; case 2: param.reg_alpha2 = hwsim_alpha2s[0]; break; case 3: param.reg_alpha2 = hwsim_alpha2s[1]; break; case 4: param.reg_strict = true; param.reg_alpha2 = hwsim_alpha2s[2]; break; } break; default: break; } param.p2p_device = support_p2p_device; param.nan_device = true; param.mlo = mlo; param.multi_radio = multi_radio; param.use_chanctx = channels > 1 || mlo || multi_radio; param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK; if (param.p2p_device) param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE); param.iftypes |= BIT(NL80211_IFTYPE_NAN); err = mac80211_hwsim_new_radio(NULL, ¶m); if (err < 0) goto out_free_radios; } hwsim_mon = alloc_netdev(0, "hwsim%d", NET_NAME_UNKNOWN, hwsim_mon_setup); if (hwsim_mon == NULL) { err = -ENOMEM; goto out_free_radios; } rtnl_lock(); err = dev_alloc_name(hwsim_mon, hwsim_mon->name); if (err < 0) { rtnl_unlock(); goto out_free_mon; } err = register_netdevice(hwsim_mon); if (err < 0) { rtnl_unlock(); goto out_free_mon; } rtnl_unlock(); return 0; out_free_mon: free_netdev(hwsim_mon); out_free_radios: mac80211_hwsim_free(); out_exit_virtio: hwsim_unregister_virtio_driver(); out_exit_netlink: hwsim_exit_netlink(); out_unregister_driver: platform_driver_unregister(&mac80211_hwsim_driver); out_unregister_pernet: unregister_pernet_device(&hwsim_net_ops); out_free_rht: rhashtable_destroy(&hwsim_radios_rht); return err; } module_init(init_mac80211_hwsim); static void __exit exit_mac80211_hwsim(void) { pr_debug("mac80211_hwsim: unregister radios\n"); hwsim_unregister_virtio_driver(); hwsim_exit_netlink(); mac80211_hwsim_free(); rhashtable_destroy(&hwsim_radios_rht); unregister_netdev(hwsim_mon); platform_driver_unregister(&mac80211_hwsim_driver); unregister_pernet_device(&hwsim_net_ops); } module_exit(exit_mac80211_hwsim); |
| 1 1 1 1 1 3 1 2 1 2 2 1 1 16 2 1 1 8 1 12 3 16 8 8 8 3 2 1 1 1 16 1 1 1 1 1 1 1 16 8 9 9 9 9 1 1 1 1 1 3 2 1 2 1 1 1 2 2 1 16 2 3 2 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/sysfs.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Theodore Ts'o (tytso@mit.edu) * */ #include <linux/time.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/part_stat.h> #include "ext4.h" #include "ext4_jbd2.h" typedef enum { attr_noop, attr_delayed_allocation_blocks, attr_session_write_kbytes, attr_lifetime_write_kbytes, attr_reserved_clusters, attr_sra_exceeded_retry_limit, attr_inode_readahead, attr_trigger_test_error, attr_first_error_time, attr_last_error_time, attr_clusters_in_group, attr_mb_order, attr_feature, attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, attr_pointer_u8, attr_pointer_string, attr_pointer_atomic, attr_journal_task, } attr_id_t; typedef enum { ptr_explicit, ptr_ext4_sb_info_offset, ptr_ext4_super_block_offset, } attr_ptr_t; static const char proc_dirname[] = "fs/ext4"; static struct proc_dir_entry *ext4_proc_root; struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; unsigned short attr_size; union { int offset; void *explicit_ptr; } u; }; static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; return sysfs_emit(buf, "%lu\n", (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; int ret; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; return count; } static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long long val; ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); return count; } static ssize_t trigger_test_error(struct ext4_sb_info *sbi, const char *buf, size_t count) { int len = count; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (len && buf[len-1] == '\n') len--; if (len) ext4_error(sbi->s_sb, "%.*s", len, buf); return count; } static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) return sysfs_emit(buf, "<none>\n"); return sysfs_emit(buf, "%d\n", task_pid_vnr(sbi->s_journal->j_task)); } #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ } #define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name) #define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature) #define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .attr_ptr = ptr_##_struct##_offset, \ .u = { \ .offset = offsetof(struct _struct, _elname),\ }, \ } #define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_pointer_string, \ .attr_size = _size, \ .attr_ptr = ptr_##_struct##_offset, \ .u = { \ .offset = offsetof(struct _struct, _elname),\ }, \ } #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_U8(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_U64(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) #define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) #define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) #define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname) #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .attr_ptr = ptr_explicit, \ .u = { \ .explicit_ptr = _ptr, \ }, \ } #define ATTR_LIST(name) &ext4_attr_##name.attr EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); EXT4_ATTR_FUNC(session_write_kbytes, 0444); EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); EXT4_ATTR_FUNC(reserved_clusters, 0644); EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count); EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(reserved_clusters), ATTR_LIST(sra_exceeded_retry_limit), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), ATTR_LIST(mb_max_to_scan), ATTR_LIST(mb_min_to_scan), ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), ATTR_LIST(err_ratelimit_interval_ms), ATTR_LIST(err_ratelimit_burst), ATTR_LIST(warning_ratelimit_interval_ms), ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(mb_best_avail_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), ATTR_LIST(first_error_ino), ATTR_LIST(last_error_ino), ATTR_LIST(first_error_block), ATTR_LIST(last_error_block), ATTR_LIST(first_error_line), ATTR_LIST(last_error_line), ATTR_LIST(first_error_func), ATTR_LIST(last_error_func), ATTR_LIST(first_error_errcode), ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), #ifdef CONFIG_EXT4_DEBUG ATTR_LIST(simulate_fail), #endif ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), ATTR_LIST(sb_update_sec), ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); /* Features this copy of ext4 supports */ EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE EXT4_ATTR_FEATURE(blocksize_gt_pagesize); #endif static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif #if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY ATTR_LIST(verity), #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE ATTR_LIST(blocksize_gt_pagesize), #endif NULL, }; ATTRIBUTE_GROUPS(ext4_feat); static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) { switch (a->attr_ptr) { case ptr_explicit: return a->u.explicit_ptr; case ptr_ext4_sb_info_offset: return (void *) (((char *) sbi) + a->u.offset); case ptr_ext4_super_block_offset: return (void *) (((char *) sbi->s_es) + a->u.offset); } return NULL; } static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { return sysfs_emit(buf, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) static ssize_t ext4_generic_attr_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { void *ptr = calc_ptr(a, sbi); if (!ptr) return 0; switch (a->attr_id) { case attr_inode_readahead: case attr_clusters_in_group: case attr_mb_order: case attr_pointer_pi: case attr_pointer_ui: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); case attr_pointer_u64: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); case attr_pointer_string: return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); case attr_pointer_atomic: return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); } return 0; } static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); switch (a->attr_id) { case attr_delayed_allocation_blocks: return sysfs_emit(buf, "%llu\n", (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: return session_write_kbytes_show(sbi, buf); case attr_lifetime_write_kbytes: return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: return sysfs_emit(buf, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); case attr_sra_exceeded_retry_limit: return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); default: return ext4_generic_attr_show(a, sbi, buf); } } static ssize_t ext4_generic_attr_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t len) { int ret; unsigned int t; unsigned long lt; void *ptr = calc_ptr(a, sbi); if (!ptr) return 0; switch (a->attr_id) { case attr_pointer_pi: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if ((int)t < 0) return -EINVAL; *((unsigned int *) ptr) = t; return len; case attr_pointer_ui: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (a->attr_ptr == ptr_ext4_super_block_offset) *((__le32 *) ptr) = cpu_to_le32(t); else *((unsigned int *) ptr) = t; return len; case attr_mb_order: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (t > 64) return -EINVAL; *((unsigned int *) ptr) = t; return len; case attr_clusters_in_group: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (t > sbi->s_clusters_per_group) return -EINVAL; *((unsigned int *) ptr) = t; return len; case attr_pointer_ul: ret = kstrtoul(skip_spaces(buf), 0, <); if (ret) return ret; *((unsigned long *) ptr) = lt; return len; } return 0; } static ssize_t ext4_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); switch (a->attr_id) { case attr_reserved_clusters: return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); default: return ext4_generic_attr_store(a, sbi, buf, len); } } static void ext4_sb_release(struct kobject *kobj) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static void ext4_feat_release(struct kobject *kobj) { kfree(kobj); } static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, }; static const struct kobj_type ext4_sb_ktype = { .default_groups = ext4_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_sb_release, }; static const struct kobj_type ext4_feat_ktype = { .default_groups = ext4_feat_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_feat_release, }; void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) { sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); } static struct kobject *ext4_root; static struct kobject *ext4_feat; int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); return err; } if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); if (sbi->s_proc) { proc_create_single_data("options", S_IRUGO, sbi->s_proc, ext4_seq_options_show, sb); proc_create_single_data("es_shrinker_info", S_IRUGO, sbi->s_proc, ext4_seq_es_shrinker_info_show, sb); proc_create_single_data("fc_info", 0444, sbi->s_proc, ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); proc_create_single_data("mb_stats", 0444, sbi->s_proc, ext4_seq_mb_stats_show, sb); proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, &ext4_mb_seq_structs_summary_ops, sb); } return 0; } void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc) remove_proc_subtree(sb->s_id, ext4_proc_root); kobject_del(&sbi->s_kobj); } int __init ext4_init_sysfs(void) { int ret; ext4_root = kobject_create_and_add("ext4", fs_kobj); if (!ext4_root) return -ENOMEM; ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); if (!ext4_feat) { ret = -ENOMEM; goto root_err; } ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, ext4_root, "features"); if (ret) goto feat_err; ext4_proc_root = proc_mkdir(proc_dirname, NULL); return ret; feat_err: kobject_put(ext4_feat); ext4_feat = NULL; root_err: kobject_put(ext4_root); ext4_root = NULL; return ret; } void ext4_exit_sysfs(void) { kobject_put(ext4_feat); ext4_feat = NULL; kobject_put(ext4_root); ext4_root = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; } |
| 20159 1999 20668 1473 2814 1264 2014 3822 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_FILE_REF_H #define _LINUX_FILE_REF_H #include <linux/atomic.h> #include <linux/preempt.h> #include <linux/types.h> /* * file_ref is a reference count implementation specifically for use by * files. It takes inspiration from rcuref but differs in key aspects * such as support for SLAB_TYPESAFE_BY_RCU type caches. * * FILE_REF_ONEREF FILE_REF_MAXREF * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL * <-------------------valid -------------------> * * FILE_REF_SATURATED * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL * <-----------------------saturation zone----------------------> * * FILE_REF_RELEASED FILE_REF_DEAD * 0xC000000000000000UL 0xE000000000000000UL * <-------------------dead zone-------------------> * * FILE_REF_NOREF * 0xFFFFFFFFFFFFFFFFUL */ #ifdef CONFIG_64BIT #define FILE_REF_ONEREF 0x0000000000000000UL #define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL #define FILE_REF_SATURATED 0xA000000000000000UL #define FILE_REF_RELEASED 0xC000000000000000UL #define FILE_REF_DEAD 0xE000000000000000UL #define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL #else #define FILE_REF_ONEREF 0x00000000U #define FILE_REF_MAXREF 0x7FFFFFFFU #define FILE_REF_SATURATED 0xA0000000U #define FILE_REF_RELEASED 0xC0000000U #define FILE_REF_DEAD 0xE0000000U #define FILE_REF_NOREF 0xFFFFFFFFU #endif typedef struct { #ifdef CONFIG_64BIT atomic64_t refcnt; #else atomic_t refcnt; #endif } file_ref_t; /** * file_ref_init - Initialize a file reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) { atomic_long_set(&ref->refcnt, cnt - 1); } bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** * file_ref_get - Acquire one reference on a file * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. * * Provides full memory ordering. * * Return: False if the attempt to acquire a reference failed. This happens * when the last reference has been put already. True if a reference * was successfully acquired */ static __always_inline __must_check bool file_ref_get(file_ref_t *ref) { /* * Unconditionally increase the reference count with full * ordering. The saturation and dead zones provide enough * tolerance for this. * * If this indicates negative the file in question the fail can * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. * Hence, unconditionally altering the file reference count to * e.g., reset the file reference count back to the middle of * the deadzone risk end up marking someone else's file as dead * behind their back. * * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF * @ref: Pointer to the reference count * * Semantically it is equivalent to calling file_ref_put(), but it trades lower * performance in face of other CPUs also modifying the refcount for higher * performance when this happens to be the last reference. * * For the last reference file_ref_put() issues 2 atomics. One to drop the * reference and another to transition it to FILE_REF_DEAD. This routine does * the work in one step, but in order to do it has to pre-read the variable which * decreases scalability. * * Use with close() et al, stick to file_ref_put() by default. */ static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) { long old; old = atomic_long_read(&ref->refcnt); if (likely(old == FILE_REF_ONEREF)) { if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) return true; } return file_ref_put(ref); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 0 : c + 1; } /* * __file_ref_read_raw - Return the value stored in ref->refcnt * @ref: Pointer to the reference count * * Return: The raw value found in the counter * * A hack for file_needs_f_pos_lock(), you probably want to use * file_ref_read() instead. */ static inline unsigned long __file_ref_read_raw(file_ref_t *ref) { return atomic_long_read(&ref->refcnt); } #endif |
| 35 56 22 6 5 5 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_EXPORTFS_H #define LINUX_EXPORTFS_H 1 #include <linux/types.h> #include <linux/path.h> struct dentry; struct iattr; struct inode; struct iomap; struct super_block; struct vfsmount; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 /* * The fileid_type identifies how the file within the filesystem is encoded. * In theory this is freely set and parsed by the filesystem, but we try to * stick to conventions so we can share some generic code and don't confuse * sniffers like ethereal/wireshark. * * The filesystem must not use the value '0' or '0xff'. */ enum fid_type { /* * The root, or export point, of the filesystem. * (Never actually passed down to the filesystem. */ FILEID_ROOT = 0, /* * 32bit inode number, 32 bit generation number. */ FILEID_INO32_GEN = 1, /* * 32bit inode number, 32 bit generation number, * 32 bit parent directory inode number. */ FILEID_INO32_GEN_PARENT = 2, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number. */ FILEID_BTRFS_WITHOUT_PARENT = 0x4d, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation. */ FILEID_BTRFS_WITH_PARENT = 0x4e, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation, * 64 bit parent root object ID. */ FILEID_BTRFS_WITH_PARENT_ROOT = 0x4f, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number. */ FILEID_UDF_WITHOUT_PARENT = 0x51, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number, * 32 bit parent block number, 32 bit parent generation number */ FILEID_UDF_WITH_PARENT = 0x52, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number. */ FILEID_NILFS_WITHOUT_PARENT = 0x61, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number, 32 bit parent generation. * 64 bit parent inode number. */ FILEID_NILFS_WITH_PARENT = 0x62, /* * 32 bit generation number, 40 bit i_pos. */ FILEID_FAT_WITHOUT_PARENT = 0x71, /* * 32 bit generation number, 40 bit i_pos, * 32 bit parent generation number, 40 bit parent i_pos */ FILEID_FAT_WITH_PARENT = 0x72, /* * 64 bit inode number, 32 bit generation number. */ FILEID_INO64_GEN = 0x81, /* * 64 bit inode number, 32 bit generation number, * 64 bit parent inode number, 32 bit parent generation. */ FILEID_INO64_GEN_PARENT = 0x82, /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) */ FILEID_LUSTRE = 0x97, /* * 64 bit inode number, 32 bit subvolume, 32 bit generation number: */ FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, /* * * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. */ FILEID_NSFS = 0xf1, /* * 64 bit unique kernfs id */ FILEID_KERNFS = 0xfe, /* * Filesystems must not use 0xff file ID. */ FILEID_INVALID = 0xff, }; struct fid { union { struct { u32 ino; u32 gen; u32 parent_ino; u32 parent_gen; } i32; struct { u64 ino; u32 gen; } __packed i64; struct { u32 block; u16 partref; u16 parent_partref; u32 generation; u32 parent_block; u32 parent_generation; } udf; DECLARE_FLEX_ARRAY(__u32, raw); }; }; enum handle_to_path_flags { HANDLE_CHECK_PERMS = (1 << 0), HANDLE_CHECK_SUBTREE = (1 << 1), }; struct handle_to_path_ctx { struct path root; enum handle_to_path_flags flags; unsigned int fh_flags; }; #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /* * Filesystems use only lower 8 bits of file_handle type for fid_type. * name_to_handle_at() uses upper 16 bits of type as user flags to be * interpreted by open_by_handle_at(). */ #define FILEID_USER_FLAGS_MASK 0xffff0000 #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK) /* Flags supported in encoded handle_type that is exported to user */ #define FILEID_IS_CONNECTABLE 0x10000 #define FILEID_IS_DIR 0x20000 #define FILEID_VALID_USER_FLAGS (FILEID_IS_CONNECTABLE | FILEID_IS_DIR) /** * struct export_operations - for nfsd to communicate with file systems * @encode_fh: encode a file handle fragment from a dentry * @fh_to_dentry: find the implied object and get a dentry for it * @fh_to_parent: find the implied object's parent and get a dentry for it * @get_name: find the name for a given inode in a given directory * @get_parent: find the parent of a given directory * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * * encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit * set, the encode_fh() should store sufficient information so that a good * attempt can be made to find not only the file but also it's place in the * filesystem. This typically means storing a reference to de->d_parent in * the filehandle fragment. encode_fh() should return the fileid_type on * success and on error returns 255 (if the space needed to encode fh is * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * * fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, * it should return a %NULL pointer if the file cannot be found, or an * %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred. * Any other error code is treated like %NULL, and will cause an %ESTALE error * for callers of exportfs_decode_fh(). * Any suitable dentry can be returned including, if necessary, a new dentry * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * * fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * * get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * * get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * * permission: * Allow filesystems to specify a custom permission function. * * open: * Allow filesystems to specify a custom open function. * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) */ struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); int (*get_name)(struct dentry *parent, char *name, struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int (*map_blocks)(struct inode *inode, loff_t offset, u64 len, struct iomap *iomap, bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ #define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ #define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */ #define EXPORT_OP_NOLOCKS (0x40) /* no file locking support */ unsigned long flags; }; /** * exportfs_cannot_lock() - check if export implements file locking * @export_ops: the nfs export operations to check * * Returns true if the export does not support file locking. */ static inline bool exportfs_cannot_lock(const struct export_operations *export_ops) { return export_ops->flags & EXPORT_OP_NOLOCKS; } extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int flags); static inline bool exportfs_can_encode_fid(const struct export_operations *nop) { return !nop || nop->encode_fh; } static inline bool exportfs_can_decode_fh(const struct export_operations *nop) { return nop && nop->fh_to_dentry; } static inline bool exportfs_can_encode_fh(const struct export_operations *nop, int fh_flags) { /* * If a non-decodeable file handle was requested, we only need to make * sure that filesystem did not opt-out of encoding fid. */ if (fh_flags & EXPORT_FH_FID) return exportfs_can_encode_fid(nop); /* Normal file handles cannot be created without export ops */ if (!nop) return false; /* * If a connectable file handle was requested, we need to make sure that * filesystem can also decode connected file handles. */ if ((fh_flags & EXPORT_FH_CONNECTABLE) && !nop->fh_to_parent) return false; /* * If a decodeable file handle was requested, we need to make sure that * filesystem can also decode file handles. */ return exportfs_can_decode_fh(nop); } static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, int *max_len) { return exportfs_encode_inode_fh(inode, fid, max_len, NULL, EXPORT_FH_FID); } extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context); /* * Generic helpers for filesystems. */ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); #endif /* LINUX_EXPORTFS_H */ |
| 11412 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { return cpu_feature_enabled(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return cpu_feature_enabled(X86_FEATURE_FXSR); } #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif |
| 31 31 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_TIMER_H #define __SND_SEQ_TIMER_H #include <sound/timer.h> #include <sound/seq_kernel.h> struct snd_seq_timer_tick { snd_seq_tick_time_t cur_tick; /* current tick */ unsigned long resolution; /* time per tick in nsec */ unsigned long fraction; /* current time per tick in nsec */ }; struct snd_seq_timer { /* ... tempo / offset / running state */ unsigned int running:1, /* running state of queue */ initialized:1; /* timer is initialized */ unsigned int tempo; /* current tempo, us/tick */ int ppq; /* time resolution, ticks/quarter */ snd_seq_real_time_t cur_time; /* current time */ struct snd_seq_timer_tick tick; /* current tick */ int tick_updated; int type; /* timer type */ struct snd_timer_id alsa_id; /* ALSA's timer ID */ struct snd_timer_instance *timeri; /* timer instance */ unsigned int ticks; unsigned long preferred_resolution; /* timer resolution, ticks/sec */ unsigned int skew; unsigned int skew_base; unsigned int tempo_base; struct timespec64 last_update; /* time of last clock update, used for interpolation */ spinlock_t lock; }; /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void); /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr); /* */ static inline void snd_seq_timer_update_tick(struct snd_seq_timer_tick *tick, unsigned long resolution) { if (tick->resolution > 0) { tick->fraction += resolution; tick->cur_tick += (unsigned int)(tick->fraction / tick->resolution); tick->fraction %= tick->resolution; } } /* compare timestamp between events */ /* return 1 if a >= b; otherwise return 0 */ static inline int snd_seq_compare_tick_time(snd_seq_tick_time_t *a, snd_seq_tick_time_t *b) { /* compare ticks */ return (*a >= *b); } static inline int snd_seq_compare_real_time(snd_seq_real_time_t *a, snd_seq_real_time_t *b) { /* compare real time */ if (a->tv_sec > b->tv_sec) return 1; if ((a->tv_sec == b->tv_sec) && (a->tv_nsec >= b->tv_nsec)) return 1; return 0; } static inline void snd_seq_sanity_real_time(snd_seq_real_time_t *tm) { while (tm->tv_nsec >= 1000000000) { /* roll-over */ tm->tv_nsec -= 1000000000; tm->tv_sec++; } } /* increment timestamp */ static inline void snd_seq_inc_real_time(snd_seq_real_time_t *tm, snd_seq_real_time_t *inc) { tm->tv_sec += inc->tv_sec; tm->tv_nsec += inc->tv_nsec; snd_seq_sanity_real_time(tm); } static inline void snd_seq_inc_time_nsec(snd_seq_real_time_t *tm, unsigned long nsec) { tm->tv_nsec += nsec; snd_seq_sanity_real_time(tm); } /* called by timer isr */ struct snd_seq_queue; int snd_seq_timer_open(struct snd_seq_queue *q); int snd_seq_timer_close(struct snd_seq_queue *q); void snd_seq_timer_defaults(struct snd_seq_timer *tmr); void snd_seq_timer_reset(struct snd_seq_timer *tmr); int snd_seq_timer_stop(struct snd_seq_timer *tmr); int snd_seq_timer_start(struct snd_seq_timer *tmr); int snd_seq_timer_continue(struct snd_seq_timer *tmr); int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo); int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq, unsigned int tempo_base); int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position); int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position); int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base); snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime); snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr); extern int seq_default_timer_class; extern int seq_default_timer_sclass; extern int seq_default_timer_card; extern int seq_default_timer_device; extern int seq_default_timer_subdevice; extern int seq_default_timer_resolution; #endif |
| 26 26 26 31 25 6 6 4 3 3 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 3 14 14 1 13 13 1 13 14 4 25 25 25 68 68 24 2 24 2 24 7 6 2 5 2 3 2 2 2 7 3 10 9 9 7 4 4 4 4 3 3 2 2 3 3 3 1 1 1 1 6 8 3 2 1 1 2 2 89 5 5 5 5 1 5 2 5 5 32 31 31 1 31 32 89 12 11 1 4 4 3 3 3 3 3 4 4 4 4 3 3 3 2 1 3 3 2 2 32 54 26 27 54 7 2 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only. */ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) static const int vhost_net_bits[] = { VHOST_FEATURES, VHOST_NET_F_VIRTIO_NET_HDR, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_RING_RESET, VIRTIO_F_IN_ORDER, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; struct rcu_head rcu; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; /* Private page frag cache */ struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. */ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r; rcu_read_lock(); r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); rcu_read_unlock(); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree_rcu(ubufs, rcu); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], NULL, add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static const struct ubuf_info_ops vhost_ubuf_ops = { .complete = vhost_zerocopy_complete, }; static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq, unsigned int count) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, vq->nheads, count); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (in_order) { vq->heads[0].len = 0; vq->nheads[0] = nvq->done_idx; } if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq, in_order ? 1 : nvq->done_idx); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout: tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr, unsigned int *ndesc) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL, ndesc); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL, ndesc); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr, unsigned int *ndesc) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr, ndesc); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, SMP_CACHE_BYTES); if (unlikely(!buf)) return -ENOMEM; copied = copy_from_iter(buf + pad - sock_hlen, len, from); if (copied != len) { ret = -EFAULT; goto err; } gso = buf + pad - sock_hlen; if (!sock_hlen) memset(buf, 0, pad); if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) { ret = -EINVAL; goto err; } } /* pad contains sock_hlen */ memcpy(buf, buf + pad - sock_hlen, sock_hlen); xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len - sock_hlen, true); ++nvq->batched_xdp; return 0; err: page_frag_free(buf); return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int ndesc = 0; do { bool busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr, &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { /* Flush batched packets to handle pending RX * work (if busyloop_intr is set) and to avoid * unnecessary virtqueue kicks. */ vhost_tx_batch(net, nvq, sock, &msg); if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } if (nvq->batched_xdp) { /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. */ vhost_tx_batch(net, nvq, sock, &msg); } msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: if (in_order) { vq->heads[0].id = cpu_to_vhost32(vq, head); } else { vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; } ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; unsigned int ndesc = 0; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release DMAs done buffers first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr, &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr, unsigned int *count) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq, *count); *count = 0; /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @nvq - the relevant vhost_net virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, struct vring_used_elem *heads, u16 *nheads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota, unsigned int *ndesc) { struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int out, in, desc_num, n = 0; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc_n(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num, &desc_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } len = iov_length(vq->iov + seg, in); if (!in_order) { heads[headcount].id = cpu_to_vhost32(vq, d); heads[headcount].len = cpu_to_vhost32(vq, len); } ++headcount; datalen -= len; seg += in; n += desc_num; } *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } if (!in_order) heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); else { heads[0].len = cpu_to_vhost32(vq, len + datalen); heads[0].id = cpu_to_vhost32(vq, d); nheads[0] = headcount; } *ndesc = n; return headcount; err: vhost_discard_vq_desc(vq, headcount, n); return r; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int count = 0; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; bool set_num_buffers; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; unsigned int ndesc; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); set_num_buffers = mergeable || vhost_has_feature(vq, VIRTIO_F_VERSION_1); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr, &count); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(nvq, vq->heads + count, vq->nheads + count, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1, &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount, ndesc); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(set_num_buffers) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount, ndesc); goto out; } nvq->done_idx += headcount; count += in_order ? 1 : headcount; if (nvq->done_idx > VHOST_NET_BATCH) { vhost_net_signal_used(nvq, count); count = 0; } if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq, count); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; page_frag_cache_init(&n->pf_cache); return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, const u64 *features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = virtio_features_test_bit(features, VIRTIO_NET_F_MRG_RXBUF) || virtio_features_test_bit(features, VIRTIO_F_VERSION_1) ? sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (virtio_features_test_bit(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO) || virtio_features_test_bit(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO)) hdr_len = sizeof(struct virtio_net_hdr_v1_hash_tunnel); if (virtio_features_test_bit(features, VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if (virtio_features_test_bit(features, VHOST_F_LOG_ALL) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if (virtio_features_test_bit(features, VIRTIO_F_ACCESS_PLATFORM)) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); virtio_features_copy(n->vqs[i].vq.acked_features_array, features); n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { const DEFINE_VHOST_FEATURES_ARRAY(vhost_net_features, vhost_net_bits); u64 all_features[VIRTIO_FEATURES_U64S]; struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features, count, copied; int r, i; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = vhost_net_features[0]; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~vhost_net_features[0]) return -EOPNOTSUPP; virtio_features_from_u64(all_features, features); return vhost_net_set_features(n, all_features); case VHOST_GET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT; /* Copy the net features, up to the user-provided buffer size */ argp += sizeof(u64); copied = min(count, (u64)VIRTIO_FEATURES_U64S); if (copy_to_user(argp, vhost_net_features, copied * sizeof(u64))) return -EFAULT; /* Zero the trailing space provided by user-space, if any */ if (clear_user(argp, size_mul(count - copied, sizeof(u64)))) return -EFAULT; return 0; case VHOST_SET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT; virtio_features_zero(all_features); argp += sizeof(u64); copied = min(count, (u64)VIRTIO_FEATURES_U64S); if (copy_from_user(all_features, argp, copied * sizeof(u64))) return -EFAULT; /* * Any feature specified by user-space above * VIRTIO_FEATURES_BITS is not supported by definition. */ for (i = copied; i < count; ++i) { if (copy_from_user(&features, featurep + 1 + i, sizeof(features))) return -EFAULT; if (features) return -EOPNOTSUPP; } for (i = 0; i < VIRTIO_FEATURES_U64S; i++) if (all_features[i] & ~vhost_net_features[i]) return -EOPNOTSUPP; return vhost_net_set_features(n, all_features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net"); |
| 111 111 550 11 550 551 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; switch (ctt->src.l3num) { case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; break; } ft->l3proto = ctt->src.l3num; ft->l4proto = ctt->dst.protonum; switch (ctt->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; break; } } struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); if (!flow) return NULL; refcount_inc(&ct->ct_general.use); flow->ct = ct; flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; } EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { if (flow_tuple->l3proto == NFPROTO_IPV6) return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct dst_entry *dst = route->tuple[dir].dst; route->tuple[dir].dst = NULL; return dst; } static int flow_offload_fill_route(struct flow_offload *flow, struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } flow_tuple->iifidx = route->tuple[dir].in.ifindex; for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; if (route->tuple[dir].in.ingress_vlans & BIT(i)) flow_tuple->in_vlan_ingress |= BIT(j); j++; } flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, ETH_ALEN); memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; default: WARN_ON_ONCE(1); break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } static void nft_flow_dst_release(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) dst_release(flow->tuplehash[dir].tuple.dst_cache); } void flow_offload_route_init(struct flow_offload *flow, struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; } EXPORT_SYMBOL_GPL(flow_offload_route_init); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; } static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) { struct ip_ct_tcp *tcp = &ct->proto.tcp; spin_lock_bh(&ct->lock); if (tcp->state != tcp_state) tcp->state = tcp_state; /* syn packet triggers the TCP reopen case from conntrack. */ if (tcp->state == TCP_CONNTRACK_CLOSE) ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; /* Conntrack state is outdated due to offload bypass. * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks * TCP reset validation will fail. */ tcp->seen[0].td_maxwin = 0; tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; spin_unlock_bh(&ct->lock); } static void flow_offload_fixup_ct(struct flow_offload *flow) { struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); bool expired, closing = false; u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { const struct nf_tcp_net *tn = nf_tcp_pernet(net); u8 tcp_state; /* Enter CLOSE state if fin/rst packet has been seen, this * allows TCP reopen from conntrack. Otherwise, pick up from * the last seen TCP state. */ closing = test_bit(NF_FLOW_CLOSING, &flow->flags); if (closing) { flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); expired = false; } else { tcp_state = READ_ONCE(ct->proto.tcp.state); flow_offload_fixup_tcp(ct, tcp_state); timeout = READ_ONCE(tn->timeouts[tcp_state]); expired = nf_flow_has_expired(flow); } offload_timeout = READ_ONCE(tn->offload_timeout); } else if (l4num == IPPROTO_UDP) { const struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; timeout = READ_ONCE(tn->timeouts[state]); expired = nf_flow_has_expired(flow); offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } if (expired) timeout -= offload_timeout; if (timeout < 0) timeout = 0; if (closing || nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) { switch (flow->type) { case NF_FLOW_OFFLOAD_ROUTE: flow_offload_route_release(flow); break; default: break; } nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; } static const struct rhashtable_params nf_flow_offload_rhash_params = { .head_offset = offsetof(struct flow_offload_tuple_rhash, node), .hashfn = flow_offload_hash, .obj_hashfn = flow_offload_hash_obj, .obj_cmpfn = flow_offload_hash_cmp, .automatic_shrinking = true, }; unsigned long flow_offload_get_timeout(struct flow_offload *flow) { unsigned long timeout = NF_FLOW_TIMEOUT; struct net *net = nf_ct_net(flow->ct); int l4num = nf_ct_protonum(flow->ct); if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); timeout = tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); timeout = tn->offload_timeout; } return timeout; } int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); if (err < 0) return err; err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[1].node, nf_flow_offload_rhash_params); if (err < 0) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); return err; } nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow, bool force) { u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (force || timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); else return; if (likely(!nf_flowtable_hw_offload(flow_table)) || test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, nf_flow_offload_rhash_params); if (!tuplehash) return NULL; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); static int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct nf_flowtable *flowtable, struct flow_offload *flow, void *data), void *data) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; int err = 0; rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { if (PTR_ERR(tuplehash) != -EAGAIN) { err = PTR_ERR(tuplehash); break; } continue; } if (tuplehash->tuple.dir) continue; flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); iter(flow_table, flow, data); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); return err; } static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, const struct flow_offload *flow) { return flow_table->type->gc && flow_table->type->gc(flow); } /** * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry * @ct: Flowtable offloaded tcp ct * * Return: number of seconds when ct entry should expire. */ static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) { u8 state = READ_ONCE(ct->proto.tcp.state); switch (state) { case TCP_CONNTRACK_SYN_SENT: case TCP_CONNTRACK_SYN_RECV: return 0; case TCP_CONNTRACK_ESTABLISHED: return NF_CT_DAY; case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_CLOSE_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: return 5 * 60 * HZ; case TCP_CONNTRACK_CLOSE: return 0; } return 0; } /** * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry * @ct: Flowtable offloaded ct * * Datapath lookups in the conntrack table will evict nf_conn entries * if they have expired. * * Once nf_conn entries have been offloaded, nf_conntrack might not see any * packets anymore. Thus ct->timeout is no longer refreshed and ct can * be evicted. * * To avoid the need for an additional check on the offload bit for every * packet processed via nf_conntrack_in(), set an arbitrary timeout large * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT * from the packet path via nf_ct_is_expired(). */ static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) { static const u32 min_timeout = 5 * 60 * HZ; u32 expires = nf_ct_expires(ct); /* normal case: large enough timeout, nothing to do. */ if (likely(expires >= min_timeout)) return; /* must check offload bit after this, we do not hold any locks. * flowtable and ct entries could have been removed on another CPU. */ if (!refcount_inc_not_zero(&ct->ct_general.use)) return; /* load ct->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_is_confirmed(ct) && test_bit(IPS_OFFLOAD_BIT, &ct->status)) { u8 l4proto = nf_ct_protonum(ct); u32 new_timeout = true; switch (l4proto) { case IPPROTO_UDP: new_timeout = NF_CT_DAY; break; case IPPROTO_TCP: new_timeout = nf_flow_table_tcp_timeout(ct); break; default: WARN_ON_ONCE(1); break; } /* Update to ct->timeout from nf_conntrack happens * without holding ct->lock. * * Use cmpxchg to ensure timeout extension doesn't * happen when we race with conntrack datapath. * * The inverse -- datapath updating ->timeout right * after this -- is fine, datapath is authoritative. */ if (new_timeout) { new_timeout += nfct_time_stamp; cmpxchg(&ct->timeout, expires, new_timeout); } } nf_ct_put(ct); } static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || nf_flow_custom_gc(flow_table, flow)) { flow_offload_teardown(flow); teardown = true; } else if (!teardown) { nf_flow_table_extend_ct_timeout(flow->ct); } if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && test_bit(NF_FLOW_HW, &flow->flags) && !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } void nf_flow_table_gc_run(struct nf_flowtable *flow_table) { nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); } static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); } static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } } void nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; hdr->source = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; hdr->dest = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); if (err < 0) return err; queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); mutex_lock(&flowtable_lock); list_add(&flowtable->list, &flowtables); mutex_unlock(&flowtable_lock); return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { struct net_device *dev = data; if (!dev) { flow_offload_teardown(flow); return; } if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_teardown(flow); } void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) { struct nf_flowtable *flowtable; mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { mutex_lock(&flowtable_lock); list_del(&flow_table->list); mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_offload_flush(flow_table); /* ... no more pending work after this stage ... */ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_gc_run(flow_table); nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); static int nf_flow_table_init_net(struct net *net) { net->ft.stat = alloc_percpu(struct nf_flow_table_stat); return net->ft.stat ? 0 : -ENOMEM; } static void nf_flow_table_fini_net(struct net *net) { free_percpu(net->ft.stat); } static int nf_flow_table_pernet_init(struct net *net) { int ret; ret = nf_flow_table_init_net(net); if (ret < 0) return ret; ret = nf_flow_table_init_proc(net); if (ret < 0) goto out_proc; return 0; out_proc: nf_flow_table_fini_net(net); return ret; } static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nf_flow_table_fini_proc(net); nf_flow_table_fini_net(net); } } static struct pernet_operations nf_flow_table_net_ops = { .init = nf_flow_table_pernet_init, .exit_batch = nf_flow_table_pernet_exit, }; static int __init nf_flow_table_module_init(void) { int ret; ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) return ret; ret = nf_flow_table_offload_init(); if (ret) goto out_offload; ret = nf_flow_register_bpf(); if (ret) goto out_bpf; return 0; out_bpf: nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("Netfilter flow table module"); |
| 120 86 97 2 24 24 47 90 90 89 90 63 89 89 37 37 37 37 37 37 37 37 90 90 55 16 55 55 16 55 55 55 17 90 42 90 90 17 17 90 90 90 90 89 90 90 90 90 90 90 90 90 90 90 90 90 90 90 89 90 89 67 67 67 67 67 67 67 22 22 22 22 22 24 24 24 23 24 24 97 97 97 60 97 97 24 24 24 2 24 24 24 24 24 24 24 24 25 25 25 25 24 24 24 24 24 24 24 3 2 24 24 24 24 24 24 24 24 24 24 24 25 63 63 63 63 63 11 63 63 63 48 48 48 48 48 48 63 63 67 67 67 67 31 67 67 3 67 67 33 66 49 67 67 67 65 65 37 37 37 37 37 37 37 37 67 65 67 67 67 67 67 63 67 63 67 23 35 24 20 24 35 35 35 35 35 35 26 27 27 24 24 7 24 67 50 67 67 50 50 50 50 50 50 50 50 50 11 50 47 47 47 47 47 47 47 47 35 35 35 35 35 35 1 35 35 35 35 35 55 47 35 35 35 55 55 55 55 55 55 55 47 47 47 47 47 35 55 55 55 55 55 55 55 55 55 14 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 55 16 16 55 55 55 55 55 55 67 67 67 55 55 48 48 47 55 48 55 55 55 55 55 55 55 67 55 67 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 119 118 119 118 28 118 118 118 118 118 118 118 2 118 22 118 118 117 118 118 97 22 118 118 118 118 98 98 98 98 98 98 28 97 98 50 50 50 50 19 50 63 50 50 50 50 50 50 46 19 49 49 49 19 19 19 49 49 19 49 19 63 63 63 63 67 63 67 63 63 63 63 63 63 63 63 63 63 63 63 8 8 63 63 63 63 63 63 63 47 63 48 63 63 63 63 63 63 63 51 48 48 63 47 47 98 98 67 94 28 1 1 1 1 28 28 98 98 98 98 98 98 98 94 98 98 98 95 95 95 51 94 94 94 94 94 94 94 94 49 63 31 31 31 31 24 76 77 98 7 98 7 7 7 7 7 7 49 41 49 49 14 49 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 24 24 24 24 24 24 24 24 24 24 41 41 41 41 41 24 24 23 24 24 24 24 41 24 24 24 24 1 24 24 24 24 67 67 51 51 51 51 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 67 98 67 98 98 98 98 98 98 98 98 98 67 67 67 67 67 51 94 98 28 97 28 98 98 98 98 28 98 5 5 22 22 22 22 22 22 22 22 5 5 5 5 1 1 22 5 5 1 5 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 3 3 3 3 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / * blocksize) blocks. So it can have information regarding groups_per_folio * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of xarrays. Group-infos are * placed in appropriate xarrays. * * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special xarray for completely empty * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed * in appropriate xarrays. * * In xarray, the index is the block group number, the value is the block group * information, and a non-empty value indicates the block group is present in * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. */ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) /* * Perform buddy integrity check with the following steps: * * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), * at most one of the two corresponding lower-order bits may be clear (free). * * 2. Order-0 (bitmap) validation, performed on bit pairs: * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits * must not be free (0). * - If both bits in a pair are clear (0, free), then exactly one of the corresponding * higher-order bits must be free (0). * * 3. Preallocation (pa) list validation: * For each preallocated block (pa) in the group: * - Verify that pa_pstart falls within the bounds of this block group. * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). */ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } } else { fstart = -1; } if (!(i & 1)) { int in_use, zero_bit_count = 0; in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); for (j = 1; j < e4b->bd_blkbits + 2; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); if (!mb_test_bit(k, buddy2)) zero_bit_count++; } MB_CHECK_ASSERT(zero_bit_count == !in_use); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); if (!pa->pa_len) continue; ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. */ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old; if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; old = grp->bb_avg_fragment_size_order; new = grp->bb_fragments == 0 ? -1 : mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new == old) return; if (old >= 0) xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); grp->bb_avg_fragment_size_order = new; if (new >= 0) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", grp->bb_group, new, err); } } static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned long group = start; struct ext4_group_info *grp; if (WARN_ON_ONCE(end > ngroups || start >= end)) return 0; xa_for_each_range(xa, group, grp, start, end - 1) { int err; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); err = ext4_mb_scan_group(ac, grp->bb_group); if (err || ac->ac_status != AC_STATUS_CONTINUE) return err; cond_resched(); } return 0; } /* * Find a suitable group of given order from the largest free orders xarray. */ static inline int ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i; int ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* Increment cr and search again if no group is found */ ac->ac_criteria = CR_GOAL_LEN_FAST; return ret; } /* * Find a suitable group of given order from the average fragments xarray. */ static int ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, ret = 0; ext4_group_t start, end; start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) ac->ac_criteria = CR_BEST_AVAIL_LEN; else ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, ext4_group_t group) { int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, order, min_order; unsigned long num_stripe_clusters = 0; ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); start = group; end = ext4_get_groups_count(ac->ac_sb); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. */ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } /* * next linear group for allocation. */ static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ *group = *group + 1 >= ngroups ? 0 : *group + 1; } static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { int ret, i; enum criteria cr = ac->ac_criteria; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group = *start; for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { ret = ext4_mb_scan_group(ac, group); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; cond_resched(); } *start = group; if (count == ngroups) ac->ac_criteria++; /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); return 0; } static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) { int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; if (!should_optimize_scan(ac)) return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (sbi->s_mb_max_linear_groups) ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, sbi->s_mb_max_linear_groups); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; switch (ac->ac_criteria) { case CR_POWER2_ALIGNED: return ext4_mb_scan_groups_p2_aligned(ac, start); case CR_GOAL_LEN_FAST: return ext4_mb_scan_groups_goal_fast(ac, start); case CR_BEST_AVAIL_LEN: return ext4_mb_scan_groups_best_avail(ac, start); default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } return 0; } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old = grp->bb_largest_free_order; for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) if (grp->bb_counters[new] > 0) break; /* No need to move between order lists? */ if (new == old) return; if (old >= 0) { struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) xa_erase(xa, grp->bb_group); } grp->bb_largest_free_order = new; if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_largest_free_orders[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", grp->bb_group, new, err); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A folio can * contain blocks_per_folio (folio_size / blocksize) blocks. * So it can have information regarding groups_per_folio which * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups * for this folio; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_folio; int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_folio = folio_size(folio) / blocksize; WARN_ON_ONCE(!blocks_per_folio); groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); mb_debug(sb, "init folio %lu\n", folio->index); /* allocate buffer_heads to read bitmaps */ if (groups_per_folio > 1) { i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; /* read all groups the folio covers into the cache */ first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If folio is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = EXT4_PG_TO_LBLK(inode, folio->index); for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap folios. This makes sure other parallel init_group * on the same buddy folio doesn't happen while holding the buddy folio lock. * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); if (folio_contains(folio, pnum)) { /* buddy and bitmap are on the same folio */ return 0; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * folio which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy folio to page cache. * The call to ext4_mb_get_buddy_folio_lock will mark the * folio accessed. */ ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same folio we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_folio_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int block; int pnum; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize. */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block))); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); /* buddy and bitmap are on the same folio? */ if (folio_contains(folio, pnum)) { folio_get(folio); goto update_buddy; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } update_buddy: /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. * * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) { bool is_stripe_aligned; struct ext4_sb_info *sbi; enum criteria cr = ac->ac_criteria; ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) return ext4_mb_simple_scan_group(ac, ac->ac_e4b); sbi = EXT4_SB(ac->ac_sb); is_stripe_aligned = false; if ((sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) is_stripe_aligned = true; if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, ac->ac_e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, ac->ac_e4b); } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Batch reads of the block allocation bitmaps to get * multiple READs in flight; limit prefetching at inexpensive * CR, otherwise mballoc can spend a lot of time loading * imperfect groups */ static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi; if (ac->ac_prefetch_grp != group) return; sbi = EXT4_SB(ac->ac_sb); if (ext4_mb_cr_expensive(ac->ac_criteria) || ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { unsigned int nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(ac->ac_sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = umin(nr, sbi->s_mb_prefetch); } ac->ac_prefetch_nr = nr; ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, &ac->ac_prefetch_ios); } } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). */ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group) { int ret; struct super_block *sb = ac->ac_sb; enum criteria cr = ac->ac_criteria; ext4_mb_might_prefetch(ac, group); /* prevent unnecessary buddy loading. */ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; /* This now checks without needing the buddy folio */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) ac->ac_first_err = ret; return 0; } ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); if (ret) return ret; /* skip busy group */ if (cr >= CR_ANY_FREE) ext4_lock_group(sb, group); else if (!ext4_try_lock_group(sb, group)) goto out_unload; /* We need to check again after locking the block group. */ if (unlikely(!ext4_mb_good_group(ac, group, cr))) goto out_unlock; __ext4_mb_scan_group(ac); out_unlock: ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(ac->ac_e4b); return ret; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t i; int err = 0; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) ac->ac_criteria = CR_POWER2_ALIGNED; ac->ac_e4b = &e4b; ac->ac_prefetch_ios = 0; ac->ac_first_err = 0; repeat: while (ac->ac_criteria < EXT4_MB_NUM_CRS) { err = ext4_mb_scan_groups(ac); if (err) goto out; if (ac->ac_status != AC_STATUS_CONTINUE) break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { int lost; /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) |