1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SOCK_REUSEPORT_H #define _SOCK_REUSEPORT_H #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/spinlock.h> #include <net/sock.h> extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. */ unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; unsigned int bind_inany:1; unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[]; /* array of sock pointers */ }; extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); struct sock *reuseport_migrate_sock(struct sock *sk, struct sock *migrating_sk, struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); static inline bool reuseport_has_conns(struct sock *sk) { struct sock_reuseport *reuse; bool ret = false; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); if (reuse && reuse->has_conns) ret = true; rcu_read_unlock(); return ret; } void reuseport_has_conns_set(struct sock *sk); void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */
3 3 3 3 3 3 3 3 3 3 3 3 3075 3075 2825 3075 3075 2824 3074 595 3073 3073 3071 3069 3073 3076 3077 3077 3077 3075 3075 3073 3073 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio SCSI HBA driver * * Copyright IBM Corp. 2010 * Copyright Red Hat, Inc. 2011 * * Authors: * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> * Paolo Bonzini <pbonzini@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/mempool.h> #include <linux/interrupt.h> #include <linux/virtio.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_scsi.h> #include <linux/cpu.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <scsi/scsi_host.h> #include <scsi/scsi_device.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_tcq.h> #include <scsi/scsi_devinfo.h> #include <linux/seqlock.h> #include <linux/blk-mq-virtio.h> #include "sd.h" #define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_EVENT_LEN 8 #define VIRTIO_SCSI_VQ_BASE 2 /* Command queue element */ struct virtio_scsi_cmd { struct scsi_cmnd *sc; struct completion *comp; union { struct virtio_scsi_cmd_req cmd; struct virtio_scsi_cmd_req_pi cmd_pi; struct virtio_scsi_ctrl_tmf_req tmf; struct virtio_scsi_ctrl_an_req an; } req; union { struct virtio_scsi_cmd_resp cmd; struct virtio_scsi_ctrl_tmf_resp tmf; struct virtio_scsi_ctrl_an_resp an; struct virtio_scsi_event evt; } resp; } ____cacheline_aligned_in_smp; struct virtio_scsi_event_node { struct virtio_scsi *vscsi; struct virtio_scsi_event event; struct work_struct work; }; struct virtio_scsi_vq { /* Protects vq */ spinlock_t vq_lock; struct virtqueue *vq; }; /* Driver instance state */ struct virtio_scsi { struct virtio_device *vdev; /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; u32 num_queues; struct hlist_node node; /* Protected by event_vq lock */ bool stop_events; struct virtio_scsi_vq ctrl_vq; struct virtio_scsi_vq event_vq; struct virtio_scsi_vq req_vqs[]; }; static struct kmem_cache *virtscsi_cmd_cache; static mempool_t *virtscsi_cmd_pool; static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev) { return vdev->priv; } static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) { if (resid) scsi_set_resid(sc, min(resid, scsi_bufflen(sc))); } /* * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done * * Called with vq_lock held. */ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; dev_dbg(&sc->device->sdev_gendev, "cmd %p response %u status %#02x sense_len %u\n", sc, resp->response, resp->status, resp->sense_len); sc->result = resp->status; virtscsi_compute_resid(sc, virtio32_to_cpu(vscsi->vdev, resp->resid)); switch (resp->response) { case VIRTIO_SCSI_S_OK: set_host_byte(sc, DID_OK); break; case VIRTIO_SCSI_S_OVERRUN: set_host_byte(sc, DID_ERROR); break; case VIRTIO_SCSI_S_ABORTED: set_host_byte(sc, DID_ABORT); break; case VIRTIO_SCSI_S_BAD_TARGET: set_host_byte(sc, DID_BAD_TARGET); break; case VIRTIO_SCSI_S_RESET: set_host_byte(sc, DID_RESET); break; case VIRTIO_SCSI_S_BUSY: set_host_byte(sc, DID_BUS_BUSY); break; case VIRTIO_SCSI_S_TRANSPORT_FAILURE: set_host_byte(sc, DID_TRANSPORT_DISRUPTED); break; case VIRTIO_SCSI_S_TARGET_FAILURE: set_host_byte(sc, DID_BAD_TARGET); break; case VIRTIO_SCSI_S_NEXUS_FAILURE: set_status_byte(sc, SAM_STAT_RESERVATION_CONFLICT); break; default: scmd_printk(KERN_WARNING, sc, "Unknown response %d", resp->response); fallthrough; case VIRTIO_SCSI_S_FAILURE: set_host_byte(sc, DID_ERROR); break; } WARN_ON(virtio32_to_cpu(vscsi->vdev, resp->sense_len) > VIRTIO_SCSI_SENSE_SIZE); if (resp->sense_len) { memcpy(sc->sense_buffer, resp->sense, min_t(u32, virtio32_to_cpu(vscsi->vdev, resp->sense_len), VIRTIO_SCSI_SENSE_SIZE)); } scsi_done(sc); } static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtio_scsi_vq *virtscsi_vq, void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; unsigned long flags; struct virtqueue *vq = virtscsi_vq->vq; spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) fn(vscsi, buf); if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); int index = vq->index - VIRTIO_SCSI_VQ_BASE; struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); }; static void virtscsi_poll_requests(struct virtio_scsi *vscsi) { int i, num_vqs; num_vqs = vscsi->num_queues; for (i = 0; i < num_vqs; i++) virtscsi_vq_done(vscsi, &vscsi->req_vqs[i], virtscsi_complete_cmd); } static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; if (cmd->comp) complete(cmd->comp); } static void virtscsi_ctrl_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); }; static void virtscsi_handle_event(struct work_struct *work); static int virtscsi_kick_event(struct virtio_scsi *vscsi, struct virtio_scsi_event_node *event_node) { int err; struct scatterlist sg; unsigned long flags; INIT_WORK(&event_node->work, virtscsi_handle_event); sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event)); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); return err; } static int virtscsi_kick_event_all(struct virtio_scsi *vscsi) { int i; for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) { vscsi->event_list[i].vscsi = vscsi; virtscsi_kick_event(vscsi, &vscsi->event_list[i]); } return 0; } static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) { int i; /* Stop scheduling work before calling cancel_work_sync. */ spin_lock_irq(&vscsi->event_vq.vq_lock); vscsi->stop_events = true; spin_unlock_irq(&vscsi->event_vq.vq_lock); for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) cancel_work_sync(&vscsi->event_list[i].work); } static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned int target = event->lun[1]; unsigned int lun = (event->lun[2] << 8) | event->lun[3]; switch (virtio32_to_cpu(vscsi->vdev, event->reason)) { case VIRTIO_SCSI_EVT_RESET_RESCAN: if (lun == 0) { scsi_scan_target(&shost->shost_gendev, 0, target, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); } else { scsi_add_device(shost, 0, target, lun); } break; case VIRTIO_SCSI_EVT_RESET_REMOVED: sdev = scsi_device_lookup(shost, 0, target, lun); if (sdev) { scsi_remove_device(sdev); scsi_device_put(sdev); } else { pr_err("SCSI device %d 0 %d %d not found\n", shost->host_no, target, lun); } break; default: pr_info("Unsupported virtio scsi event reason %x\n", event->reason); } } static void virtscsi_handle_param_change(struct virtio_scsi *vscsi, struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned int target = event->lun[1]; unsigned int lun = (event->lun[2] << 8) | event->lun[3]; u8 asc = virtio32_to_cpu(vscsi->vdev, event->reason) & 255; u8 ascq = virtio32_to_cpu(vscsi->vdev, event->reason) >> 8; sdev = scsi_device_lookup(shost, 0, target, lun); if (!sdev) { pr_err("SCSI device %d 0 %d %d not found\n", shost->host_no, target, lun); return; } /* Handle "Parameters changed", "Mode parameters changed", and "Capacity data has changed". */ if (asc == 0x2a && (ascq == 0x00 || ascq == 0x01 || ascq == 0x09)) scsi_rescan_device(sdev); scsi_device_put(sdev); } static int virtscsi_rescan_hotunplug(struct virtio_scsi *vscsi) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); unsigned char scsi_cmd[MAX_COMMAND_SIZE]; int result, inquiry_len, inq_result_len = 256; char *inq_result = kmalloc(inq_result_len, GFP_KERNEL); if (!inq_result) return -ENOMEM; shost_for_each_device(sdev, shost) { inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36; memset(scsi_cmd, 0, sizeof(scsi_cmd)); scsi_cmd[0] = INQUIRY; scsi_cmd[4] = (unsigned char) inquiry_len; memset(inq_result, 0, inq_result_len); result = scsi_execute_cmd(sdev, scsi_cmd, REQ_OP_DRV_IN, inq_result, inquiry_len, SD_TIMEOUT, SD_MAX_RETRIES, NULL); if (result == 0 && inq_result[0] >> 5) { /* PQ indicates the LUN is not attached */ scsi_remove_device(sdev); } else if (result > 0 && host_byte(result) == DID_BAD_TARGET) { /* * If all LUNs of a virtio-scsi device are unplugged * it will respond with BAD TARGET on any INQUIRY * command. * Remove the device in this case as well. */ scsi_remove_device(sdev); } } kfree(inq_result); return 0; } static void virtscsi_handle_event(struct work_struct *work) { struct virtio_scsi_event_node *event_node = container_of(work, struct virtio_scsi_event_node, work); struct virtio_scsi *vscsi = event_node->vscsi; struct virtio_scsi_event *event = &event_node->event; if (event->event & cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) { int ret; event->event &= ~cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED); ret = virtscsi_rescan_hotunplug(vscsi); if (ret) return; scsi_scan_host(virtio_scsi_host(vscsi->vdev)); } switch (virtio32_to_cpu(vscsi->vdev, event->event)) { case VIRTIO_SCSI_T_NO_EVENT: break; case VIRTIO_SCSI_T_TRANSPORT_RESET: virtscsi_handle_transport_reset(vscsi, event); break; case VIRTIO_SCSI_T_PARAM_CHANGE: virtscsi_handle_param_change(vscsi, event); break; default: pr_err("Unsupported virtio scsi event %x\n", event->event); } virtscsi_kick_event(vscsi, event_node); } static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_event_node *event_node = buf; if (!vscsi->stop_events) queue_work(system_freezable_wq, &event_node->work); } static void virtscsi_event_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); }; static int __virtscsi_add_cmd(struct virtqueue *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size) { struct scsi_cmnd *sc = cmd->sc; struct scatterlist *sgs[6], req, resp; struct sg_table *out, *in; unsigned out_num = 0, in_num = 0; out = in = NULL; if (sc && sc->sc_data_direction != DMA_NONE) { if (sc->sc_data_direction != DMA_FROM_DEVICE) out = &sc->sdb.table; if (sc->sc_data_direction != DMA_TO_DEVICE) in = &sc->sdb.table; } /* Request header. */ sg_init_one(&req, &cmd->req, req_size); sgs[out_num++] = &req; /* Data-out buffer. */ if (out) { /* Place WRITE protection SGLs before Data OUT payload */ if (scsi_prot_sg_count(sc)) sgs[out_num++] = scsi_prot_sglist(sc); sgs[out_num++] = out->sgl; } /* Response header. */ sg_init_one(&resp, &cmd->resp, resp_size); sgs[out_num + in_num++] = &resp; /* Data-in buffer */ if (in) { /* Place READ protection SGLs before Data IN payload */ if (scsi_prot_sg_count(sc)) sgs[out_num + in_num++] = scsi_prot_sglist(sc); sgs[out_num + in_num++] = in->sgl; } return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_ATOMIC); } static void virtscsi_kick_vq(struct virtio_scsi_vq *vq) { bool needs_kick; unsigned long flags; spin_lock_irqsave(&vq->vq_lock, flags); needs_kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->vq_lock, flags); if (needs_kick) virtqueue_notify(vq->vq); } /** * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue, optionally kick it * @vq : the struct virtqueue we're talking about * @cmd : command structure * @req_size : size of the request buffer * @resp_size : size of the response buffer * @kick : whether to kick the virtqueue immediately */ static int virtscsi_add_cmd(struct virtio_scsi_vq *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size, bool kick) { unsigned long flags; int err; bool needs_kick = false; spin_lock_irqsave(&vq->vq_lock, flags); err = __virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size); if (!err && kick) needs_kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->vq_lock, flags); if (needs_kick) virtqueue_notify(vq->vq); return err; } static void virtio_scsi_init_hdr(struct virtio_device *vdev, struct virtio_scsi_cmd_req *cmd, struct scsi_cmnd *sc) { cmd->lun[0] = 1; cmd->lun[1] = sc->device->id; cmd->lun[2] = (sc->device->lun >> 8) | 0x40; cmd->lun[3] = sc->device->lun & 0xff; cmd->tag = cpu_to_virtio64(vdev, (unsigned long)sc); cmd->task_attr = VIRTIO_SCSI_S_SIMPLE; cmd->prio = 0; cmd->crn = 0; } #ifdef CONFIG_BLK_DEV_INTEGRITY static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, struct virtio_scsi_cmd_req_pi *cmd_pi, struct scsi_cmnd *sc) { struct request *rq = scsi_cmd_to_rq(sc); struct blk_integrity *bi; virtio_scsi_init_hdr(vdev, (struct virtio_scsi_cmd_req *)cmd_pi, sc); if (!rq || !scsi_prot_sg_count(sc)) return; bi = blk_get_integrity(rq->q->disk); if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, bio_integrity_bytes(bi, blk_rq_sectors(rq))); else if (sc->sc_data_direction == DMA_FROM_DEVICE) cmd_pi->pi_bytesin = cpu_to_virtio32(vdev, bio_integrity_bytes(bi, blk_rq_sectors(rq))); } #endif static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi, struct scsi_cmnd *sc) { u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(sc)); u16 hwq = blk_mq_unique_tag_to_hwq(tag); return &vscsi->req_vqs[hwq]; } static int virtscsi_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(shost); struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc); struct virtio_scsi_cmd *cmd = scsi_cmd_priv(sc); bool kick; unsigned long flags; int req_size; int ret; BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize); /* TODO: check feature bit and fail if unsupported? */ BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL); dev_dbg(&sc->device->sdev_gendev, "cmd %p CDB: %#02x\n", sc, sc->cmnd[0]); cmd->sc = sc; BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); #ifdef CONFIG_BLK_DEV_INTEGRITY if (virtio_has_feature(vscsi->vdev, VIRTIO_SCSI_F_T10_PI)) { virtio_scsi_init_hdr_pi(vscsi->vdev, &cmd->req.cmd_pi, sc); memcpy(cmd->req.cmd_pi.cdb, sc->cmnd, sc->cmd_len); req_size = sizeof(cmd->req.cmd_pi); } else #endif { virtio_scsi_init_hdr(vscsi->vdev, &cmd->req.cmd, sc); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); req_size = sizeof(cmd->req.cmd); } kick = (sc->flags & SCMD_LAST) != 0; ret = virtscsi_add_cmd(req_vq, cmd, req_size, sizeof(cmd->resp.cmd), kick); if (ret == -EIO) { cmd->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; spin_lock_irqsave(&req_vq->vq_lock, flags); virtscsi_complete_cmd(vscsi, cmd); spin_unlock_irqrestore(&req_vq->vq_lock, flags); } else if (ret != 0) { return SCSI_MLQUEUE_HOST_BUSY; } return 0; } static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); int ret = FAILED; cmd->comp = &comp; if (virtscsi_add_cmd(&vscsi->ctrl_vq, cmd, sizeof cmd->req.tmf, sizeof cmd->resp.tmf, true) < 0) goto out; wait_for_completion(&comp); if (cmd->resp.tmf.response == VIRTIO_SCSI_S_OK || cmd->resp.tmf.response == VIRTIO_SCSI_S_FUNCTION_SUCCEEDED) ret = SUCCESS; /* * The spec guarantees that all requests related to the TMF have * been completed, but the callback might not have run yet if * we're using independent interrupts (e.g. MSI). Poll the * virtqueues once. * * In the abort case, scsi_done() will do nothing, because the * command timed out and hence SCMD_STATE_COMPLETE has been set. */ virtscsi_poll_requests(vscsi); out: mempool_free(cmd, virtscsi_cmd_pool); return ret; } static int virtscsi_device_reset(struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(sc->device->host); struct virtio_scsi_cmd *cmd; sdev_printk(KERN_INFO, sc->device, "device reset\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); if (!cmd) return FAILED; memset(cmd, 0, sizeof(*cmd)); cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET), .lun[0] = 1, .lun[1] = sc->device->id, .lun[2] = (sc->device->lun >> 8) | 0x40, .lun[3] = sc->device->lun & 0xff, }; return virtscsi_tmf(vscsi, cmd); } static int virtscsi_device_alloc(struct scsi_device *sdevice) { /* * Passed through SCSI targets (e.g. with qemu's 'scsi-block') * may have transfer limits which come from the host SCSI * controller or something on the host side other than the * target itself. * * To make this work properly, the hypervisor can adjust the * target's VPD information to advertise these limits. But * for that to work, the guest has to look at the VPD pages, * which we won't do by default if it is an SPC-2 device, even * if it does actually support it. * * So, set the blist to always try to read the VPD pages. */ sdevice->sdev_bflags = BLIST_TRY_VPD_PAGES; return 0; } /** * virtscsi_change_queue_depth() - Change a virtscsi target's queue depth * @sdev: Virtscsi target whose queue depth to change * @qdepth: New queue depth */ static int virtscsi_change_queue_depth(struct scsi_device *sdev, int qdepth) { struct Scsi_Host *shost = sdev->host; int max_depth = shost->cmd_per_lun; return scsi_change_queue_depth(sdev, min(max_depth, qdepth)); } static int virtscsi_abort(struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(sc->device->host); struct virtio_scsi_cmd *cmd; scmd_printk(KERN_INFO, sc, "abort\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); if (!cmd) return FAILED; memset(cmd, 0, sizeof(*cmd)); cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = VIRTIO_SCSI_T_TMF_ABORT_TASK, .lun[0] = 1, .lun[1] = sc->device->id, .lun[2] = (sc->device->lun >> 8) | 0x40, .lun[3] = sc->device->lun & 0xff, .tag = cpu_to_virtio64(vscsi->vdev, (unsigned long)sc), }; return virtscsi_tmf(vscsi, cmd); } static void virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); } static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq) { struct virtio_scsi *vscsi = shost_priv(shost); virtscsi_kick_vq(&vscsi->req_vqs[hwq]); } /* * The host guarantees to respond to each command, although I/O * latencies might be higher than on bare metal. Reset the timer * unconditionally to give the host a chance to perform EH. */ static enum scsi_timeout_action virtscsi_eh_timed_out(struct scsi_cmnd *scmnd) { return SCSI_EH_RESET_TIMER; } static const struct scsi_host_template virtscsi_host_template = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", .proc_name = "virtio_scsi", .this_id = -1, .cmd_size = sizeof(struct virtio_scsi_cmd), .queuecommand = virtscsi_queuecommand, .commit_rqs = virtscsi_commit_rqs, .change_queue_depth = virtscsi_change_queue_depth, .eh_abort_handler = virtscsi_abort, .eh_device_reset_handler = virtscsi_device_reset, .eh_timed_out = virtscsi_eh_timed_out, .slave_alloc = virtscsi_device_alloc, .dma_boundary = UINT_MAX, .map_queues = virtscsi_map_queues, .track_queue_depth = 1, }; #define virtscsi_config_get(vdev, fld) \ ({ \ __virtio_native_type(struct virtio_scsi_config, fld) __val; \ virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \ __val; \ }) #define virtscsi_config_set(vdev, fld, val) \ do { \ __virtio_native_type(struct virtio_scsi_config, fld) __val = (val); \ virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \ } while(0) static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, struct virtqueue *vq) { spin_lock_init(&virtscsi_vq->vq_lock); virtscsi_vq->vq = vq; } static void virtscsi_remove_vqs(struct virtio_device *vdev) { /* Stop all the virtqueues. */ virtio_reset_device(vdev); vdev->config->del_vqs(vdev); } static int virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi) { int err; u32 i; u32 num_vqs; vq_callback_t **callbacks; const char **names; struct virtqueue **vqs; struct irq_affinity desc = { .pre_vectors = 2 }; num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; vqs = kmalloc_array(num_vqs, sizeof(struct virtqueue *), GFP_KERNEL); callbacks = kmalloc_array(num_vqs, sizeof(vq_callback_t *), GFP_KERNEL); names = kmalloc_array(num_vqs, sizeof(char *), GFP_KERNEL); if (!callbacks || !vqs || !names) { err = -ENOMEM; goto out; } callbacks[0] = virtscsi_ctrl_done; callbacks[1] = virtscsi_event_done; names[0] = "control"; names[1] = "event"; for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { callbacks[i] = virtscsi_req_done; names[i] = "request"; } /* Discover virtqueues and write information to configuration. */ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]); for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], vqs[i]); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); err = 0; out: kfree(names); kfree(callbacks); kfree(vqs); if (err) virtscsi_remove_vqs(vdev); return err; } static int virtscsi_probe(struct virtio_device *vdev) { struct Scsi_Host *shost; struct virtio_scsi *vscsi; int err; u32 sg_elems, num_targets; u32 cmd_per_lun; u32 num_queues; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } /* We need to know how many queues before we allocate. */ num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; num_queues = min_t(unsigned int, nr_cpu_ids, num_queues); num_targets = virtscsi_config_get(vdev, max_target) + 1; shost = scsi_host_alloc(&virtscsi_host_template, struct_size(vscsi, req_vqs, num_queues)); if (!shost) return -ENOMEM; sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; shost->sg_tablesize = sg_elems; vscsi = shost_priv(shost); vscsi->vdev = vdev; vscsi->num_queues = num_queues; vdev->priv = shost; err = virtscsi_init(vdev, vscsi); if (err) goto virtscsi_init_failed; shost->can_queue = virtqueue_get_vring_size(vscsi->req_vqs[0].vq); cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; /* LUNs > 256 are reported with format 1, so they go in the range * 16640-32767. */ shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1 + 0x4000; shost->max_id = num_targets; shost->max_channel = 0; shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE; shost->nr_hw_queues = num_queues; #ifdef CONFIG_BLK_DEV_INTEGRITY if (virtio_has_feature(vdev, VIRTIO_SCSI_F_T10_PI)) { int host_prot; host_prot = SHOST_DIF_TYPE1_PROTECTION | SHOST_DIF_TYPE2_PROTECTION | SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE1_PROTECTION | SHOST_DIX_TYPE2_PROTECTION | SHOST_DIX_TYPE3_PROTECTION; scsi_host_set_prot(shost, host_prot); scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); } #endif err = scsi_add_host(shost, &vdev->dev); if (err) goto scsi_add_host_failed; virtio_device_ready(vdev); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); scsi_scan_host(shost); return 0; scsi_add_host_failed: vdev->config->del_vqs(vdev); virtscsi_init_failed: scsi_host_put(shost); return err; } static void virtscsi_remove(struct virtio_device *vdev) { struct Scsi_Host *shost = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(shost); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_cancel_event_work(vscsi); scsi_remove_host(shost); virtscsi_remove_vqs(vdev); scsi_host_put(shost); } #ifdef CONFIG_PM_SLEEP static int virtscsi_freeze(struct virtio_device *vdev) { virtscsi_remove_vqs(vdev); return 0; } static int virtscsi_restore(struct virtio_device *vdev) { struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); int err; err = virtscsi_init(vdev, vscsi); if (err) return err; virtio_device_ready(vdev); if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); return err; } #endif static struct virtio_device_id id_table[] = { { VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_SCSI_F_HOTPLUG, VIRTIO_SCSI_F_CHANGE, #ifdef CONFIG_BLK_DEV_INTEGRITY VIRTIO_SCSI_F_T10_PI, #endif }; static struct virtio_driver virtio_scsi_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtscsi_probe, #ifdef CONFIG_PM_SLEEP .freeze = virtscsi_freeze, .restore = virtscsi_restore, #endif .remove = virtscsi_remove, }; static int __init virtio_scsi_init(void) { int ret = -ENOMEM; virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); if (!virtscsi_cmd_cache) { pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); goto error; } virtscsi_cmd_pool = mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, virtscsi_cmd_cache); if (!virtscsi_cmd_pool) { pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } ret = register_virtio_driver(&virtio_scsi_driver); if (ret < 0) goto error; return 0; error: mempool_destroy(virtscsi_cmd_pool); virtscsi_cmd_pool = NULL; kmem_cache_destroy(virtscsi_cmd_cache); virtscsi_cmd_cache = NULL; return ret; } static void __exit virtio_scsi_fini(void) { unregister_virtio_driver(&virtio_scsi_driver); mempool_destroy(virtscsi_cmd_pool); kmem_cache_destroy(virtscsi_cmd_cache); } module_init(virtio_scsi_init); module_exit(virtio_scsi_fini); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio SCSI HBA driver"); MODULE_LICENSE("GPL");
4144 27 779 4269 34 12 1 1 7401 16 4223 4224 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0-only */ /* A pointer that can point to either kernel or userspace memory. */ #ifndef _LINUX_BPFPTR_H #define _LINUX_BPFPTR_H #include <linux/mm.h> #include <linux/sockptr.h> typedef sockptr_t bpfptr_t; static inline bool bpfptr_is_kernel(bpfptr_t bpfptr) { return bpfptr.is_kernel; } static inline bpfptr_t KERNEL_BPFPTR(void *p) { return (bpfptr_t) { .kernel = p, .is_kernel = true }; } static inline bpfptr_t USER_BPFPTR(void __user *p) { return (bpfptr_t) { .user = p }; } static inline bpfptr_t make_bpfptr(u64 addr, bool is_kernel) { if (is_kernel) return KERNEL_BPFPTR((void*) (uintptr_t) addr); else return USER_BPFPTR(u64_to_user_ptr(addr)); } static inline bool bpfptr_is_null(bpfptr_t bpfptr) { if (bpfptr_is_kernel(bpfptr)) return !bpfptr.kernel; return !bpfptr.user; } static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val) { if (bpfptr_is_kernel(*bpfptr)) bpfptr->kernel += val; else bpfptr->user += val; } static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src, size_t offset, size_t size) { if (!bpfptr_is_kernel(src)) return copy_from_user(dst, src.user + offset, size); return copy_from_kernel_nofault(dst, src.kernel + offset, size); } static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size) { return copy_from_bpfptr_offset(dst, src, 0, size); } static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, const void *src, size_t size) { return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size); } static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) { void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_bpfptr(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) { if (bpfptr_is_kernel(src)) return strncpy_from_kernel_nofault(dst, src.kernel, count); return strncpy_from_user(dst, src.user, count); } #endif /* _LINUX_BPFPTR_H */
1176 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/init.h> #include <linux/export.h> #include <linux/pm.h> #include <linux/tick.h> #include <linux/random.h> #include <linux/user-return-notifier.h> #include <linux/dmi.h> #include <linux/utsname.h> #include <linux/stackprotector.h> #include <linux/cpuidle.h> #include <linux/acpi.h> #include <linux/elf-randomize.h> #include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/api.h> #include <asm/fpu/sched.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> #include <asm/shstk.h> #include "process.h" /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned * so they are allowed to end up in the .data..cacheline_aligned * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower * privilege level. Since the init task never runs anything * but ring 0 code, there is no need for a valid value here. * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { memcpy(dst, src, arch_task_struct_size); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif /* Drop the copied pointer to current's fpstate */ dst->thread.fpu.fpstate = NULL; return 0; } #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { if (fpu_state_size_dynamic()) fpstate_free(&tsk->thread.fpu); } #endif /* * Free thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); free_vm86(t); shstk_free(tsk); fpu__drop(fpu); } static int set_new_tls(struct task_struct *p, unsigned long tls) { struct user_desc __user *utls = (struct user_desc __user *)tls; if (in_ia32_syscall()) return do_set_thread_area(p, -1, utls, 0); else return do_set_thread_area_64(p, ARCH_SET_FS, tls); } __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int (*fn)(void *), void *fn_arg) { schedule_tail(prev); /* Is this a kernel thread? */ if (unlikely(fn)) { fn(fn_arg); /* * A kernel thread is allowed to return here after successfully * calling kernel_execve(). Exit to userspace to complete the * execve() syscall. */ regs->ax = 0; } syscall_exit_to_user_mode(regs); } int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; frame->bp = encode_frame_pointer(childregs); frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 current_save_fsgs(); p->thread.fsindex = current->thread.fsindex; p->thread.fsbase = current->thread.fsbase; p->thread.gsindex = current->thread.gsindex; p->thread.gsbase = current->thread.gsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); #else p->thread.sp0 = (unsigned long) (childregs + 1); savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain * flags. The flags consistency (especially vs. AC) is there * ensured via objtool, which lacks 32bit support. */ frame->flags = X86_EFLAGS_FIXED; #endif /* * Allocate a new shadow stack for thread if needed. If shadow stack, * is disabled, new_ssp will remain 0, and fpu_clone() will know not to * update it. */ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); if (IS_ERR_VALUE(new_ssp)) return PTR_ERR((void *)new_ssp); fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* * Clone current's PKRU value from hardware. tsk->thread.pkru * is only valid when scheduled out. */ p->thread.pkru = read_pkru(); frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; if (unlikely(args->fn)) { /* * A user space thread, but it doesn't return to * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. * * It does the same kernel frame setup to return to a kernel * function that a kernel thread does. */ childregs->sp = 0; childregs->ip = 0; kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) ret = set_new_tls(p, tls); if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); return ret; } static void pkru_flush_thread(void) { /* * If PKRU is enabled the default PKRU value has to be loaded into * the hardware right here (similar to context switch). */ pkru_write_default(); } void flush_thread(void) { struct task_struct *tsk = current; flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); fpu_flush_thread(); pkru_flush_thread(); } void disable_TSC(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_set_bits(X86_CR4_TSD); preempt_enable(); } static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } int get_tsc_mode(unsigned long adr) { unsigned int val; if (test_thread_flag(TIF_NOTSC)) val = PR_TSC_SIGSEGV; else val = PR_TSC_ENABLE; return put_user(val, (unsigned int __user *)adr); } int set_tsc_mode(unsigned int val) { if (val == PR_TSC_SIGSEGV) disable_TSC(); else if (val == PR_TSC_ENABLE) enable_TSC(); else return -EINVAL; return 0; } DEFINE_PER_CPU(u64, msr_misc_features_shadow); static void set_cpuid_faulting(bool on) { u64 msrval; msrval = this_cpu_read(msr_misc_features_shadow); msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(true); } preempt_enable(); } static void enable_cpuid(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(false); } preempt_enable(); } static int get_cpuid_mode(void) { return !test_thread_flag(TIF_NOCPUID); } static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) enable_cpuid(); else disable_cpuid(); return 0; } /* * Called immediately after a successful exec. */ void arch_setup_new_exec(void) { /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); /* * Don't inherit TIF_SSBD across exec boundary when * PR_SPEC_DISABLE_NOEXEC is used. */ if (test_thread_flag(TIF_SSBD) && task_spec_ssb_noexec(current)) { clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } mm_reset_untag_mask(current->mm); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void switch_to_bitmap(unsigned long tifp) { /* * Invalidate I/O bitmap if the previous task used it. This prevents * any possible leakage of an active I/O bitmap. * * If the next task has an I/O bitmap it will handle it on exit to * user mode. */ if (tifp & _TIF_IO_BITMAP) tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) { /* * Copy at least the byte range of the incoming tasks bitmap which * covers the permitted I/O ports. * * If the previous task which used an I/O bitmap had more bits * permitted, then the copy needs to cover those as well so they * get turned off. */ memcpy(tss->io_bitmap.bitmap, iobm->bitmap, max(tss->io_bitmap.prev_max, iobm->max)); /* * Store the new max and the sequence number of this bitmap * and a pointer to the bitmap itself. */ tss->io_bitmap.prev_max = iobm->max; tss->io_bitmap.prev_sequence = iobm->sequence; } /** * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct thread_struct *t = &current->thread; u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { native_tss_invalidate_io_bitmap(); return; } if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { *base = IO_BITMAP_OFFSET_VALID_ALL; } else { struct io_bitmap *iobm = t->io_bitmap; /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. */ if (tss->io_bitmap.prev_sequence != iobm->sequence) tss_copy_io_bitmap(tss, iobm); /* Enable the bitmap */ *base = IO_BITMAP_OFFSET_VALID_MAP; } /* * Make sure that the TSS limit is covering the IO bitmap. It might have * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O * access from user space to trigger a #GP because tbe bitmap is outside * the TSS limit. */ refresh_tss_limit(); } #else /* CONFIG_X86_IOPL_IOPERM */ static inline void switch_to_bitmap(unsigned long tifp) { } #endif #ifdef CONFIG_SMP struct ssb_state { struct ssb_state *shared_state; raw_spinlock_t lock; unsigned int disable_state; unsigned long local_state; }; #define LSTATE_SSB 0 static DEFINE_PER_CPU(struct ssb_state, ssb_state); void speculative_store_bypass_ht_init(void) { struct ssb_state *st = this_cpu_ptr(&ssb_state); unsigned int this_cpu = smp_processor_id(); unsigned int cpu; st->local_state = 0; /* * Shared state setup happens once on the first bringup * of the CPU. It's not destroyed on CPU hotunplug. */ if (st->shared_state) return; raw_spin_lock_init(&st->lock); /* * Go over HT siblings and check whether one of them has set up the * shared state pointer already. */ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { if (cpu == this_cpu) continue; if (!per_cpu(ssb_state, cpu).shared_state) continue; /* Link it to the state of the sibling: */ st->shared_state = per_cpu(ssb_state, cpu).shared_state; return; } /* * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link * themselves to the state of this CPU. */ st->shared_state = st; } /* * Logic is: First HT sibling enables SSBD for both siblings in the core * and last sibling to disable it, disables it for the whole core. This how * MSR_SPEC_CTRL works in "hardware": * * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL */ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { struct ssb_state *st = this_cpu_ptr(&ssb_state); u64 msr = x86_amd_ls_cfg_base; if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); return; } if (tifn & _TIF_SSBD) { /* * Since this can race with prctl(), block reentry on the * same CPU. */ if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) return; msr |= x86_amd_ls_cfg_ssbd_mask; raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) return; raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } #else static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); } #endif static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) { /* * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* * Update the MSRs managing speculation control, during context switch. * * tifp: Previous task's thread flags * tifn: Next task's thread flags */ static __always_inline void __speculation_ctrl_update(unsigned long tifp, unsigned long tifn) { unsigned long tif_diff = tifp ^ tifn; u64 msr = x86_spec_ctrl_base; bool updmsr = false; lockdep_assert_irqs_disabled(); /* Handle change of TIF_SSBD depending on the mitigation method. */ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || static_cpu_has(X86_FEATURE_AMD_SSBD)) { updmsr |= !!(tif_diff & _TIF_SSBD); msr |= ssbd_tif_to_spec_ctrl(tifn); } /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); msr |= stibp_tif_to_spec_ctrl(tifn); } if (updmsr) update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) { if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { if (task_spec_ssb_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SSBD); else clear_tsk_thread_flag(tsk, TIF_SSBD); if (task_spec_ib_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SPEC_IB); else clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) { unsigned long flags; /* Forced update. Make sure all relevant TIF flags are different */ local_irq_save(flags); __speculation_ctrl_update(~tif, tif); local_irq_restore(flags); } /* Called from seccomp/prctl update */ void speculation_ctrl_update_current(void) { preempt_disable(); speculation_ctrl_update(speculation_ctrl_update_tif(current)); preempt_enable(); } static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); newval = cr4 ^ mask; if (newval != cr4) { this_cpu_write(cpu_tlbstate.cr4, newval); __write_cr4(newval); } } void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; tifn = read_task_thread_flags(next_p); tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && arch_has_block_step()) { unsigned long debugctl, msk; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { __speculation_ctrl_update(tifp, tifn); } else { speculation_ctrl_update_tif(prev_p); tifn = speculation_ctrl_update_tif(next_p); /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } } /* * Idle related variables and functions */ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); /* * We use this if we don't have any better idle routine.. */ void __cpuidle default_idle(void) { raw_safe_halt(); raw_local_irq_disable(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); #endif DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); static bool x86_idle_set(void) { return !!static_call_query(x86_idle); } #ifndef CONFIG_SMP static inline void __noreturn play_dead(void) { BUG(); } #endif void arch_cpu_idle_enter(void) { tsc_verify_tsc_adjust(false); local_touch_nmi(); } void __noreturn arch_cpu_idle_dead(void) { play_dead(); } /* * Called from the generic idle code. */ void __cpuidle arch_cpu_idle(void) { static_call(x86_idle)(); } EXPORT_SYMBOL_GPL(arch_cpu_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) { bool ret = x86_idle_set(); static_call_update(x86_idle, default_idle); return ret; } #endif struct cpumask cpus_stop_mask; void __noreturn stop_this_cpu(void *dummy) { struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); unsigned int cpu = smp_processor_id(); local_irq_disable(); /* * Remove this CPU from the online mask and disable it * unconditionally. This might be redundant in case that the reboot * vector was handled late and stop_other_cpus() sent an NMI. * * According to SDM and APM NMIs can be accepted even after soft * disabling the local APIC. */ set_cpu_online(cpu, false); disable_local_APIC(); mcheck_cpu_clear(c); /* * Use wbinvd on processors that support SME. This provides support * for performing a successful kexec when going from SME inactive * to SME active (or vice-versa). The cache must be cleared so that * if there are entries with the same physical address, both with and * without the encryption bit, they don't race each other when flushed * and potentially end up with the wrong entry being committed to * memory. * * Test the CPUID bit directly because the machine might've cleared * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) native_wbinvd(); /* * This brings a cache line back and dirties it, but * native_stop_other_cpus() will overwrite cpus_stop_mask after it * observed that all CPUs reported stop. This write will invalidate * the related cache line on this CPU. */ cpumask_clear_cpu(cpu, &cpus_stop_mask); for (;;) { /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the * native_wbinvd() above. */ native_halt(); } } /* * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power * states (local apic timer and TSC stop). * * XXX this function is completely buggered vs RCU and tracing. */ static void amd_e400_idle(void) { /* * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E * gets set after static_cpu_has() places have been converted via * alternatives. */ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { default_idle(); return; } tick_broadcast_enter(); default_idle(); tick_broadcast_exit(); } /* * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf * exists and whenever MONITOR/MWAIT extensions are present there is at * least one C1 substate. * * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; /* User has disallowed the use of MWAIT. Fallback to HALT */ if (boot_option_idle_override == IDLE_NOMWAIT) return 0; /* MWAIT is not supported on this platform. Fallback to HALT */ if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; /* Monitor has a bug. Fallback to HALT */ if (boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT * with EAX=0, ECX=0. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) return 1; /* * If MWAIT extensions are available, there should be at least one * MWAIT C1 substate present. */ return (edx & MWAIT_C1_SUBSTATE_MASK); } /* * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { mb(); /* quirk */ clflush((void *)&current_thread_info()->flags); mb(); /* quirk */ } __monitor((void *)&current_thread_info()->flags, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); } } __current_clr_polling(); } void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); #endif if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) return; if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); static_call_update(x86_idle, amd_e400_idle); } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); static_call_update(x86_idle, tdx_safe_halt); } else static_call_update(x86_idle, default_idle); } void amd_e400_c1e_apic_setup(void) { if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); local_irq_disable(); tick_broadcast_force(); local_irq_enable(); } } void __init arch_post_acpi_subsys_init(void) { u32 lo, hi; if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) return; /* * AMD E400 detection needs to happen after ACPI has been enabled. If * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in * MSR_K8_INT_PENDING_MSG. */ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) return; boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); boot_option_idle_override = IDLE_POLL; cpu_idle_poll_ctrl(true); } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is * forced to be used for CPU idle. In such case CPU C2/C3 * won't be used again. * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ static_call_update(x86_idle, default_idle); boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, * it means that mwait will be disabled for CPU C1/C2/C3 * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else return -1; return 0; } early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_u32_below(8192); return sp & ~0xf; } unsigned long arch_randomize_brk(struct mm_struct *mm) { return randomize_page(mm->brk, 0x02000000); } /* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack * changing under us. */ unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long addr = 0; if (!try_get_task_stack(p)) return 0; for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr) break; if (in_sched_functions(addr)) continue; break; } put_task_stack(p); return addr; } long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: return fpu_xstate_prctl(option, arg2); } return -EINVAL; }
1011 1011 1011 966 1011 606 606 604 603 552 966 1011 1011 1067 55 1012 51 1021 41 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1016 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: ima_crypto.c * Calculates md5/sha1 file hash, template hash, boot-aggreate hash */ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/ratelimit.h> #include <linux/file.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/err.h> #include <linux/slab.h> #include <crypto/hash.h> #include "ima.h" /* minimum file size for ahash use */ static unsigned long ima_ahash_minsize; module_param_named(ahash_minsize, ima_ahash_minsize, ulong, 0644); MODULE_PARM_DESC(ahash_minsize, "Minimum file size for ahash use"); /* default is 0 - 1 page. */ static int ima_maxorder; static unsigned int ima_bufsize = PAGE_SIZE; static int param_set_bufsize(const char *val, const struct kernel_param *kp) { unsigned long long size; int order; size = memparse(val, NULL); order = get_order(size); if (order > MAX_ORDER) return -EINVAL; ima_maxorder = order; ima_bufsize = PAGE_SIZE << order; return 0; } static const struct kernel_param_ops param_ops_bufsize = { .set = param_set_bufsize, .get = param_get_uint, }; #define param_check_bufsize(name, p) __param_check(name, p, unsigned int) module_param_named(ahash_bufsize, ima_bufsize, bufsize, 0644); MODULE_PARM_DESC(ahash_bufsize, "Maximum ahash buffer size"); static struct crypto_shash *ima_shash_tfm; static struct crypto_ahash *ima_ahash_tfm; struct ima_algo_desc { struct crypto_shash *tfm; enum hash_algo algo; }; int ima_sha1_idx __ro_after_init; int ima_hash_algo_idx __ro_after_init; /* * Additional number of slots reserved, as needed, for SHA1 * and IMA default algo. */ int ima_extra_slots __ro_after_init; static struct ima_algo_desc *ima_algo_array; static int __init ima_init_ima_crypto(void) { long rc; ima_shash_tfm = crypto_alloc_shash(hash_algo_name[ima_hash_algo], 0, 0); if (IS_ERR(ima_shash_tfm)) { rc = PTR_ERR(ima_shash_tfm); pr_err("Can not allocate %s (reason: %ld)\n", hash_algo_name[ima_hash_algo], rc); return rc; } pr_info("Allocated hash algorithm: %s\n", hash_algo_name[ima_hash_algo]); return 0; } static struct crypto_shash *ima_alloc_tfm(enum hash_algo algo) { struct crypto_shash *tfm = ima_shash_tfm; int rc, i; if (algo < 0 || algo >= HASH_ALGO__LAST) algo = ima_hash_algo; if (algo == ima_hash_algo) return tfm; for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) if (ima_algo_array[i].tfm && ima_algo_array[i].algo == algo) return ima_algo_array[i].tfm; tfm = crypto_alloc_shash(hash_algo_name[algo], 0, 0); if (IS_ERR(tfm)) { rc = PTR_ERR(tfm); pr_err("Can not allocate %s (reason: %d)\n", hash_algo_name[algo], rc); } return tfm; } int __init ima_init_crypto(void) { enum hash_algo algo; long rc; int i; rc = ima_init_ima_crypto(); if (rc) return rc; ima_sha1_idx = -1; ima_hash_algo_idx = -1; for (i = 0; i < NR_BANKS(ima_tpm_chip); i++) { algo = ima_tpm_chip->allocated_banks[i].crypto_id; if (algo == HASH_ALGO_SHA1) ima_sha1_idx = i; if (algo == ima_hash_algo) ima_hash_algo_idx = i; } if (ima_sha1_idx < 0) { ima_sha1_idx = NR_BANKS(ima_tpm_chip) + ima_extra_slots++; if (ima_hash_algo == HASH_ALGO_SHA1) ima_hash_algo_idx = ima_sha1_idx; } if (ima_hash_algo_idx < 0) ima_hash_algo_idx = NR_BANKS(ima_tpm_chip) + ima_extra_slots++; ima_algo_array = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*ima_algo_array), GFP_KERNEL); if (!ima_algo_array) { rc = -ENOMEM; goto out; } for (i = 0; i < NR_BANKS(ima_tpm_chip); i++) { algo = ima_tpm_chip->allocated_banks[i].crypto_id; ima_algo_array[i].algo = algo; /* unknown TPM algorithm */ if (algo == HASH_ALGO__LAST) continue; if (algo == ima_hash_algo) { ima_algo_array[i].tfm = ima_shash_tfm; continue; } ima_algo_array[i].tfm = ima_alloc_tfm(algo); if (IS_ERR(ima_algo_array[i].tfm)) { if (algo == HASH_ALGO_SHA1) { rc = PTR_ERR(ima_algo_array[i].tfm); ima_algo_array[i].tfm = NULL; goto out_array; } ima_algo_array[i].tfm = NULL; } } if (ima_sha1_idx >= NR_BANKS(ima_tpm_chip)) { if (ima_hash_algo == HASH_ALGO_SHA1) { ima_algo_array[ima_sha1_idx].tfm = ima_shash_tfm; } else { ima_algo_array[ima_sha1_idx].tfm = ima_alloc_tfm(HASH_ALGO_SHA1); if (IS_ERR(ima_algo_array[ima_sha1_idx].tfm)) { rc = PTR_ERR(ima_algo_array[ima_sha1_idx].tfm); goto out_array; } } ima_algo_array[ima_sha1_idx].algo = HASH_ALGO_SHA1; } if (ima_hash_algo_idx >= NR_BANKS(ima_tpm_chip) && ima_hash_algo_idx != ima_sha1_idx) { ima_algo_array[ima_hash_algo_idx].tfm = ima_shash_tfm; ima_algo_array[ima_hash_algo_idx].algo = ima_hash_algo; } return 0; out_array: for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) { if (!ima_algo_array[i].tfm || ima_algo_array[i].tfm == ima_shash_tfm) continue; crypto_free_shash(ima_algo_array[i].tfm); } kfree(ima_algo_array); out: crypto_free_shash(ima_shash_tfm); return rc; } static void ima_free_tfm(struct crypto_shash *tfm) { int i; if (tfm == ima_shash_tfm) return; for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) if (ima_algo_array[i].tfm == tfm) return; crypto_free_shash(tfm); } /** * ima_alloc_pages() - Allocate contiguous pages. * @max_size: Maximum amount of memory to allocate. * @allocated_size: Returned size of actual allocation. * @last_warn: Should the min_size allocation warn or not. * * Tries to do opportunistic allocation for memory first trying to allocate * max_size amount of memory and then splitting that until zero order is * reached. Allocation is tried without generating allocation warnings unless * last_warn is set. Last_warn set affects only last allocation of zero order. * * By default, ima_maxorder is 0 and it is equivalent to kmalloc(GFP_KERNEL) * * Return pointer to allocated memory, or NULL on failure. */ static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size, int last_warn) { void *ptr; int order = ima_maxorder; gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY; if (order) order = min(get_order(max_size), order); for (; order; order--) { ptr = (void *)__get_free_pages(gfp_mask, order); if (ptr) { *allocated_size = PAGE_SIZE << order; return ptr; } } /* order is zero - one page */ gfp_mask = GFP_KERNEL; if (!last_warn) gfp_mask |= __GFP_NOWARN; ptr = (void *)__get_free_pages(gfp_mask, 0); if (ptr) { *allocated_size = PAGE_SIZE; return ptr; } *allocated_size = 0; return NULL; } /** * ima_free_pages() - Free pages allocated by ima_alloc_pages(). * @ptr: Pointer to allocated pages. * @size: Size of allocated buffer. */ static void ima_free_pages(void *ptr, size_t size) { if (!ptr) return; free_pages((unsigned long)ptr, get_order(size)); } static struct crypto_ahash *ima_alloc_atfm(enum hash_algo algo) { struct crypto_ahash *tfm = ima_ahash_tfm; int rc; if (algo < 0 || algo >= HASH_ALGO__LAST) algo = ima_hash_algo; if (algo != ima_hash_algo || !tfm) { tfm = crypto_alloc_ahash(hash_algo_name[algo], 0, 0); if (!IS_ERR(tfm)) { if (algo == ima_hash_algo) ima_ahash_tfm = tfm; } else { rc = PTR_ERR(tfm); pr_err("Can not allocate %s (reason: %d)\n", hash_algo_name[algo], rc); } } return tfm; } static void ima_free_atfm(struct crypto_ahash *tfm) { if (tfm != ima_ahash_tfm) crypto_free_ahash(tfm); } static inline int ahash_wait(int err, struct crypto_wait *wait) { err = crypto_wait_req(err, wait); if (err) pr_crit_ratelimited("ahash calculation failed: err: %d\n", err); return err; } static int ima_calc_file_hash_atfm(struct file *file, struct ima_digest_data *hash, struct crypto_ahash *tfm) { loff_t i_size, offset; char *rbuf[2] = { NULL, }; int rc, rbuf_len, active = 0, ahash_rc = 0; struct ahash_request *req; struct scatterlist sg[1]; struct crypto_wait wait; size_t rbuf_size[2]; hash->length = crypto_ahash_digestsize(tfm); req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) return -ENOMEM; crypto_init_wait(&wait); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); rc = ahash_wait(crypto_ahash_init(req), &wait); if (rc) goto out1; i_size = i_size_read(file_inode(file)); if (i_size == 0) goto out2; /* * Try to allocate maximum size of memory. * Fail if even a single page cannot be allocated. */ rbuf[0] = ima_alloc_pages(i_size, &rbuf_size[0], 1); if (!rbuf[0]) { rc = -ENOMEM; goto out1; } /* Only allocate one buffer if that is enough. */ if (i_size > rbuf_size[0]) { /* * Try to allocate secondary buffer. If that fails fallback to * using single buffering. Use previous memory allocation size * as baseline for possible allocation size. */ rbuf[1] = ima_alloc_pages(i_size - rbuf_size[0], &rbuf_size[1], 0); } for (offset = 0; offset < i_size; offset += rbuf_len) { if (!rbuf[1] && offset) { /* Not using two buffers, and it is not the first * read/request, wait for the completion of the * previous ahash_update() request. */ rc = ahash_wait(ahash_rc, &wait); if (rc) goto out3; } /* read buffer */ rbuf_len = min_t(loff_t, i_size - offset, rbuf_size[active]); rc = integrity_kernel_read(file, offset, rbuf[active], rbuf_len); if (rc != rbuf_len) { if (rc >= 0) rc = -EINVAL; /* * Forward current rc, do not overwrite with return value * from ahash_wait() */ ahash_wait(ahash_rc, &wait); goto out3; } if (rbuf[1] && offset) { /* Using two buffers, and it is not the first * read/request, wait for the completion of the * previous ahash_update() request. */ rc = ahash_wait(ahash_rc, &wait); if (rc) goto out3; } sg_init_one(&sg[0], rbuf[active], rbuf_len); ahash_request_set_crypt(req, sg, NULL, rbuf_len); ahash_rc = crypto_ahash_update(req); if (rbuf[1]) active = !active; /* swap buffers, if we use two */ } /* wait for the last update request to complete */ rc = ahash_wait(ahash_rc, &wait); out3: ima_free_pages(rbuf[0], rbuf_size[0]); ima_free_pages(rbuf[1], rbuf_size[1]); out2: if (!rc) { ahash_request_set_crypt(req, NULL, hash->digest, 0); rc = ahash_wait(crypto_ahash_final(req), &wait); } out1: ahash_request_free(req); return rc; } static int ima_calc_file_ahash(struct file *file, struct ima_digest_data *hash) { struct crypto_ahash *tfm; int rc; tfm = ima_alloc_atfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); rc = ima_calc_file_hash_atfm(file, hash, tfm); ima_free_atfm(tfm); return rc; } static int ima_calc_file_hash_tfm(struct file *file, struct ima_digest_data *hash, struct crypto_shash *tfm) { loff_t i_size, offset = 0; char *rbuf; int rc; SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; hash->length = crypto_shash_digestsize(tfm); rc = crypto_shash_init(shash); if (rc != 0) return rc; i_size = i_size_read(file_inode(file)); if (i_size == 0) goto out; rbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!rbuf) return -ENOMEM; while (offset < i_size) { int rbuf_len; rbuf_len = integrity_kernel_read(file, offset, rbuf, PAGE_SIZE); if (rbuf_len < 0) { rc = rbuf_len; break; } if (rbuf_len == 0) { /* unexpected EOF */ rc = -EINVAL; break; } offset += rbuf_len; rc = crypto_shash_update(shash, rbuf, rbuf_len); if (rc) break; } kfree(rbuf); out: if (!rc) rc = crypto_shash_final(shash, hash->digest); return rc; } static int ima_calc_file_shash(struct file *file, struct ima_digest_data *hash) { struct crypto_shash *tfm; int rc; tfm = ima_alloc_tfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); rc = ima_calc_file_hash_tfm(file, hash, tfm); ima_free_tfm(tfm); return rc; } /* * ima_calc_file_hash - calculate file hash * * Asynchronous hash (ahash) allows using HW acceleration for calculating * a hash. ahash performance varies for different data sizes on different * crypto accelerators. shash performance might be better for smaller files. * The 'ima.ahash_minsize' module parameter allows specifying the best * minimum file size for using ahash on the system. * * If the ima.ahash_minsize parameter is not specified, this function uses * shash for the hash calculation. If ahash fails, it falls back to using * shash. */ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) { loff_t i_size; int rc; struct file *f = file; bool new_file_instance = false; /* * For consistency, fail file's opened with the O_DIRECT flag on * filesystems mounted with/without DAX option. */ if (file->f_flags & O_DIRECT) { hash->length = hash_digest_size[ima_hash_algo]; hash->algo = ima_hash_algo; return -EINVAL; } /* Open a new file instance in O_RDONLY if we cannot read */ if (!(file->f_mode & FMODE_READ)) { int flags = file->f_flags & ~(O_WRONLY | O_APPEND | O_TRUNC | O_CREAT | O_NOCTTY | O_EXCL); flags |= O_RDONLY; f = dentry_open(&file->f_path, flags, file->f_cred); if (IS_ERR(f)) return PTR_ERR(f); new_file_instance = true; } i_size = i_size_read(file_inode(f)); if (ima_ahash_minsize && i_size >= ima_ahash_minsize) { rc = ima_calc_file_ahash(f, hash); if (!rc) goto out; } rc = ima_calc_file_shash(f, hash); out: if (new_file_instance) fput(f); return rc; } /* * Calculate the hash of template data */ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, struct ima_template_entry *entry, int tfm_idx) { SHASH_DESC_ON_STACK(shash, ima_algo_array[tfm_idx].tfm); struct ima_template_desc *td = entry->template_desc; int num_fields = entry->template_desc->num_fields; int rc, i; shash->tfm = ima_algo_array[tfm_idx].tfm; rc = crypto_shash_init(shash); if (rc != 0) return rc; for (i = 0; i < num_fields; i++) { u8 buffer[IMA_EVENT_NAME_LEN_MAX + 1] = { 0 }; u8 *data_to_hash = field_data[i].data; u32 datalen = field_data[i].len; u32 datalen_to_hash = !ima_canonical_fmt ? datalen : (__force u32)cpu_to_le32(datalen); if (strcmp(td->name, IMA_TEMPLATE_IMA_NAME) != 0) { rc = crypto_shash_update(shash, (const u8 *) &datalen_to_hash, sizeof(datalen_to_hash)); if (rc) break; } else if (strcmp(td->fields[i]->field_id, "n") == 0) { memcpy(buffer, data_to_hash, datalen); data_to_hash = buffer; datalen = IMA_EVENT_NAME_LEN_MAX + 1; } rc = crypto_shash_update(shash, data_to_hash, datalen); if (rc) break; } if (!rc) rc = crypto_shash_final(shash, entry->digests[tfm_idx].digest); return rc; } int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry) { u16 alg_id; int rc, i; rc = ima_calc_field_array_hash_tfm(field_data, entry, ima_sha1_idx); if (rc) return rc; entry->digests[ima_sha1_idx].alg_id = TPM_ALG_SHA1; for (i = 0; i < NR_BANKS(ima_tpm_chip) + ima_extra_slots; i++) { if (i == ima_sha1_idx) continue; if (i < NR_BANKS(ima_tpm_chip)) { alg_id = ima_tpm_chip->allocated_banks[i].alg_id; entry->digests[i].alg_id = alg_id; } /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (!ima_algo_array[i].tfm) { memcpy(entry->digests[i].digest, entry->digests[ima_sha1_idx].digest, TPM_DIGEST_SIZE); continue; } rc = ima_calc_field_array_hash_tfm(field_data, entry, i); if (rc) return rc; } return rc; } static int calc_buffer_ahash_atfm(const void *buf, loff_t len, struct ima_digest_data *hash, struct crypto_ahash *tfm) { struct ahash_request *req; struct scatterlist sg; struct crypto_wait wait; int rc, ahash_rc = 0; hash->length = crypto_ahash_digestsize(tfm); req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) return -ENOMEM; crypto_init_wait(&wait); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); rc = ahash_wait(crypto_ahash_init(req), &wait); if (rc) goto out; sg_init_one(&sg, buf, len); ahash_request_set_crypt(req, &sg, NULL, len); ahash_rc = crypto_ahash_update(req); /* wait for the update request to complete */ rc = ahash_wait(ahash_rc, &wait); if (!rc) { ahash_request_set_crypt(req, NULL, hash->digest, 0); rc = ahash_wait(crypto_ahash_final(req), &wait); } out: ahash_request_free(req); return rc; } static int calc_buffer_ahash(const void *buf, loff_t len, struct ima_digest_data *hash) { struct crypto_ahash *tfm; int rc; tfm = ima_alloc_atfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); rc = calc_buffer_ahash_atfm(buf, len, hash, tfm); ima_free_atfm(tfm); return rc; } static int calc_buffer_shash_tfm(const void *buf, loff_t size, struct ima_digest_data *hash, struct crypto_shash *tfm) { SHASH_DESC_ON_STACK(shash, tfm); unsigned int len; int rc; shash->tfm = tfm; hash->length = crypto_shash_digestsize(tfm); rc = crypto_shash_init(shash); if (rc != 0) return rc; while (size) { len = size < PAGE_SIZE ? size : PAGE_SIZE; rc = crypto_shash_update(shash, buf, len); if (rc) break; buf += len; size -= len; } if (!rc) rc = crypto_shash_final(shash, hash->digest); return rc; } static int calc_buffer_shash(const void *buf, loff_t len, struct ima_digest_data *hash) { struct crypto_shash *tfm; int rc; tfm = ima_alloc_tfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); rc = calc_buffer_shash_tfm(buf, len, hash, tfm); ima_free_tfm(tfm); return rc; } int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash) { int rc; if (ima_ahash_minsize && len >= ima_ahash_minsize) { rc = calc_buffer_ahash(buf, len, hash); if (!rc) return 0; } return calc_buffer_shash(buf, len, hash); } static void ima_pcrread(u32 idx, struct tpm_digest *d) { if (!ima_tpm_chip) return; if (tpm_pcr_read(ima_tpm_chip, idx, d) != 0) pr_err("Error Communicating to TPM chip\n"); } /* * The boot_aggregate is a cumulative hash over TPM registers 0 - 7. With * TPM 1.2 the boot_aggregate was based on reading the SHA1 PCRs, but with * TPM 2.0 hash agility, TPM chips could support multiple TPM PCR banks, * allowing firmware to configure and enable different banks. * * Knowing which TPM bank is read to calculate the boot_aggregate digest * needs to be conveyed to a verifier. For this reason, use the same * hash algorithm for reading the TPM PCRs as for calculating the boot * aggregate digest as stored in the measurement list. */ static int ima_calc_boot_aggregate_tfm(char *digest, u16 alg_id, struct crypto_shash *tfm) { struct tpm_digest d = { .alg_id = alg_id, .digest = {0} }; int rc; u32 i; SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; pr_devel("calculating the boot-aggregate based on TPM bank: %04x\n", d.alg_id); rc = crypto_shash_init(shash); if (rc != 0) return rc; /* cumulative digest over TPM registers 0-7 */ for (i = TPM_PCR0; i < TPM_PCR8; i++) { ima_pcrread(i, &d); /* now accumulate with current aggregate */ rc = crypto_shash_update(shash, d.digest, crypto_shash_digestsize(tfm)); if (rc != 0) return rc; } /* * Extend cumulative digest over TPM registers 8-9, which contain * measurement for the kernel command line (reg. 8) and image (reg. 9) * in a typical PCR allocation. Registers 8-9 are only included in * non-SHA1 boot_aggregate digests to avoid ambiguity. */ if (alg_id != TPM_ALG_SHA1) { for (i = TPM_PCR8; i < TPM_PCR10; i++) { ima_pcrread(i, &d); rc = crypto_shash_update(shash, d.digest, crypto_shash_digestsize(tfm)); } } if (!rc) crypto_shash_final(shash, digest); return rc; } int ima_calc_boot_aggregate(struct ima_digest_data *hash) { struct crypto_shash *tfm; u16 crypto_id, alg_id; int rc, i, bank_idx = -1; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; if (crypto_id == hash->algo) { bank_idx = i; break; } if (crypto_id == HASH_ALGO_SHA256) bank_idx = i; if (bank_idx == -1 && crypto_id == HASH_ALGO_SHA1) bank_idx = i; } if (bank_idx == -1) { pr_err("No suitable TPM algorithm for boot aggregate\n"); return 0; } hash->algo = ima_tpm_chip->allocated_banks[bank_idx].crypto_id; tfm = ima_alloc_tfm(hash->algo); if (IS_ERR(tfm)) return PTR_ERR(tfm); hash->length = crypto_shash_digestsize(tfm); alg_id = ima_tpm_chip->allocated_banks[bank_idx].alg_id; rc = ima_calc_boot_aggregate_tfm(hash->digest, alg_id, tfm); ima_free_tfm(tfm); return rc; }
1454 1161 1456 1440 350 1409 1072 1075 1075 1034 1024 935 76 290 232 93 93 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> #include <net/ipv6_stubs.h> #include <net/addrconf.h> #include <net/ip.h> /* if ipv6 module registers this function is used by xfrm to force all * sockets to relookup their nodes - this is fairly expensive, be * careful */ void (*__fib6_flush_trees)(struct net *); EXPORT_SYMBOL(__fib6_flush_trees); #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) static inline unsigned int ipv6_addr_scope2type(unsigned int scope) { switch (scope) { case IPV6_ADDR_SCOPE_NODELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | IPV6_ADDR_LOOPBACK); case IPV6_ADDR_SCOPE_LINKLOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | IPV6_ADDR_LINKLOCAL); case IPV6_ADDR_SCOPE_SITELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | IPV6_ADDR_SITELOCAL); } return IPV6_ADDR_SCOPE_TYPE(scope); } int __ipv6_addr_type(const struct in6_addr *addr) { __be32 st; st = addr->s6_addr32[0]; /* Consider all addresses with the first three bits different of 000 and 111 as unicasts. */ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { /* multicast */ /* addr-select 3.1 */ return (IPV6_ADDR_MULTICAST | ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); } if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { if (addr->s6_addr32[3] == 0) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) return (IPV6_ADDR_MAPPED | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } EXPORT_SYMBOL(__ipv6_addr_type); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain); int register_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&inet6addr_chain, nb); } EXPORT_SYMBOL(register_inet6addr_notifier); int unregister_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&inet6addr_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_notifier); int inet6addr_notifier_call_chain(unsigned long val, void *v) { return atomic_notifier_call_chain(&inet6addr_chain, val, v); } EXPORT_SYMBOL(inet6addr_notifier_call_chain); int register_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(register_inet6addr_validator_notifier); int unregister_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_validator_notifier); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) { return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v); } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { return ERR_PTR(-EAFNOSUPPORT); } static int eafnosupport_ipv6_route_input(struct sk_buff *skb) { return -EAFNOSUPPORT; } static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) { return NULL; } static int eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return -EAFNOSUPPORT; } static int eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return -EAFNOSUPPORT; } static void eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { } static u32 eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { return 0; } static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EAFNOSUPPORT; } static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { return -EAFNOSUPPORT; } static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { kfree_skb(skb); return -EAFNOSUPPORT; } static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev) { return ERR_PTR(-EAFNOSUPPORT); } const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, .ip6_del_rt = eafnosupport_ip6_del_rt, .ipv6_fragment = eafnosupport_ipv6_fragment, .ipv6_dev_find = eafnosupport_ipv6_dev_find, }; EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; EXPORT_SYMBOL(in6addr_loopback); const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; EXPORT_SYMBOL(in6addr_any); const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_linklocal_allnodes); const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_linklocal_allrouters); const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_sitelocal_allrouters); static void snmp6_free_dev(struct inet6_dev *idev) { kfree(idev->stats.icmpv6msgdev); kfree(idev->stats.icmpv6dev); free_percpu(idev->stats.ipv6); } static void in6_dev_finish_destroy_rcu(struct rcu_head *head) { struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); snmp6_free_dev(idev); kfree(idev); } /* Nobody refers to this device, we may destroy it. */ void in6_dev_finish_destroy(struct inet6_dev *idev) { struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); WARN_ON(rcu_access_pointer(idev->mc_list)); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) { pr_warn("Freeing alive inet6 device %p\n", idev); return; } call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); } EXPORT_SYMBOL(in6_dev_finish_destroy);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cgroup #if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CGROUP_H #include <linux/cgroup.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(cgroup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root), TP_STRUCT__entry( __field( int, root ) __field( u16, ss_mask ) __string( name, root->name ) ), TP_fast_assign( __entry->root = root->hierarchy_id; __entry->ss_mask = root->subsys_mask; __assign_str(name, root->name); ), TP_printk("root=%d ss_mask=%#x name=%s", __entry->root, __entry->ss_mask, __get_str(name)) ); DEFINE_EVENT(cgroup_root, cgroup_setup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_destroy_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_remount, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DECLARE_EVENT_CLASS(cgroup, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), TP_printk("root=%d id=%llu level=%d path=%s", __entry->root, __entry->id, __entry->level, __get_str(path)) ); DEFINE_EVENT(cgroup, cgroup_mkdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rmdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_release, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rename, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_freeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_unfreeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DECLARE_EVENT_CLASS(cgroup_migrate, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup), TP_STRUCT__entry( __field( int, dst_root ) __field( int, dst_level ) __field( u64, dst_id ) __field( int, pid ) __string( dst_path, path ) __string( comm, task->comm ) ), TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; __assign_str(comm, task->comm); ), TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s", __entry->dst_root, __entry->dst_id, __entry->dst_level, __get_str(dst_path), __entry->pid, __get_str(comm)) ); DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DECLARE_EVENT_CLASS(cgroup_event, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) __field( int, val ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; ), TP_printk("root=%d id=%llu level=%d path=%s val=%d", __entry->root, __entry->id, __entry->level, __get_str(path), __entry->val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_populated, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
14 14 2 196 196 196 1536 4 4 1 1 1 14 14 14 12 12 12 12 7 1 1 2 1 1 1 7 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.7.0 * * 070228 : Fix to allow multiple sessions with same remote MAC and same * session id by including the local device ifindex in the * tuple identifying a session. This also ensures packets can't * be injected into a session from interfaces other than the one * specified by userspace. Florian Zumbiehl <florz@florz.de> * (Oh, BTW, this one is YYMMDD, in case you were wondering ...) * 220102 : Fix module use count on failure in pppoe_create, pppox_sk -acme * 030700 : Fixed connect logic to allow for disconnect. * 270700 : Fixed potential SMP problems; we must protect against * simultaneous invocation of ppp_input * and ppp_unregister_channel. * 040800 : Respect reference count mechanisms on net-devices. * 200800 : fix kfree(skb) in pppoe_rcv (acme) * Module reference count is decremented in the right spot now, * guards against sock_put not actually freeing the sk * in pppoe_release. * 051000 : Initialization cleanup. * 111100 : Fix recvmsg. * 050101 : Fix PADT processing. * 140501 : Use pppoe_rcv_core to handle all backlog. (Alexey) * 170701 : Do not lock_sock with rwlock held. (DaveM) * Ignore discovery frames if user has socket * locked. (DaveM) * Ignore return value of dev_queue_xmit in __pppoe_xmit * or else we may kfree an SKB twice. (DaveM) * 190701 : When doing copies of skb's in __pppoe_xmit, always delete * the original skb that was passed in on success, never on * failure. Delete the copy of the skb on failure to avoid * a memory leak. * 081001 : Misc. cleanup (licence string, non-blocking, prevent * reference of device on close). * 121301 : New ppp channels interface; cannot unregister a channel * from interrupts. Thus, we mark the socket as a ZOMBIE * and do the unregistration later. * 081002 : seq_file support for proc stuff -acme * 111602 : Merge all 2.4 fixes into 2.5/2.6 tree. Label 2.5/2.6 * as version 0.7. Spacing cleanup. * Author: Michal Ostrowski <mostrows@speakeasy.net> * Contributors: * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * David S. Miller (davem@redhat.com) * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/if_pppox.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <linux/uaccess.h> #define PPPOE_HASH_BITS CONFIG_PPPOE_HASH_BITS #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) #define PPPOE_HASH_MASK (PPPOE_HASH_SIZE - 1) static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all * nets by injecting net id into the hash but * it would increase hash chains and add * a few additional math comparisons messy * as well, moreover in case of SMP less locking * controversy here */ struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; rwlock_t hash_lock; }; /* * PPPoE could be in the following stages: * 1) Discovery stage (to obtain remote MAC and Session ID) * 2) Session stage (MAC and SID are known) * * Ethernet frames have a special tag for this but * we use simpler approach based on session id */ static inline bool stage_session(__be16 sid) { return sid != 0; } static inline struct pppoe_net *pppoe_pernet(struct net *net) { return net_generic(net, pppoe_net_id); } static inline int cmp_2_addr(struct pppoe_addr *a, struct pppoe_addr *b) { return a->sid == b->sid && ether_addr_equal(a->remote, b->remote); } static inline int cmp_addr(struct pppoe_addr *a, __be16 sid, char *addr) { return a->sid == sid && ether_addr_equal(a->remote, addr); } #if 8 % PPPOE_HASH_BITS #error 8 must be a multiple of PPPOE_HASH_BITS #endif static int hash_item(__be16 sid, unsigned char *addr) { unsigned char hash = 0; unsigned int i; for (i = 0; i < ETH_ALEN; i++) hash ^= addr[i]; for (i = 0; i < sizeof(sid_t) * 8; i += 8) hash ^= (__force __u32)sid >> i; for (i = 8; (i >>= 1) >= PPPOE_HASH_BITS;) hash ^= hash >> i; return hash & PPPOE_HASH_MASK; } /********************************************************************** * * Set/get/delete/rehash items (internal versions) * **********************************************************************/ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; ret = ret->next; } return NULL; } static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; ret = ret->next; } po->next = pn->hash_table[hash]; pn->hash_table[hash] = po; return 0; } static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret, **src; ret = pn->hash_table[hash]; src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { *src = ret->next; break; } src = &ret->next; ret = ret->next; } } /********************************************************************** * * Set/get/delete/rehash items * **********************************************************************/ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { struct pppox_sock *po; read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); if (po) sock_hold(sk_pppox(po)); read_unlock_bh(&pn->hash_lock); return po; } static inline struct pppox_sock *get_item_by_addr(struct net *net, struct sockaddr_pppox *sp) { struct net_device *dev; struct pppoe_net *pn; struct pppox_sock *pppox_sock = NULL; int ifindex; rcu_read_lock(); dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); if (dev) { ifindex = dev->ifindex; pn = pppoe_pernet(net); pppox_sock = get_item(pn, sp->sa_addr.pppoe.sid, sp->sa_addr.pppoe.remote, ifindex); } rcu_read_unlock(); return pppox_sock; } static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { write_lock_bh(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); write_unlock_bh(&pn->hash_lock); } /*************************************************************************** * * Handler for device events. * Certain device events require that sockets be unconnected. * **************************************************************************/ static void pppoe_flush_dev(struct net_device *dev) { struct pppoe_net *pn; int i; pn = pppoe_pernet(dev_net(dev)); write_lock_bh(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { struct pppox_sock *po = pn->hash_table[i]; struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { po = po->next; } if (!po) break; sk = sk_pppox(po); /* We always grab the socket lock, followed by the * hash_lock, in that order. Since we should hold the * sock lock while doing any unbinding, we need to * release the lock we're holding. Hold a reference to * the sock so it doesn't disappear as we're jumping * between locks. */ sock_hold(sk); write_unlock_bh(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { pppox_unbind_sock(sk); sk->sk_state_change(sk); po->pppoe_dev = NULL; dev_put(dev); } release_sock(sk); sock_put(sk); /* Restart the process from the start of the current * hash chain. We dropped locks so the world may have * change from underneath us. */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); write_lock_bh(&pn->hash_lock); po = pn->hash_table[i]; } } write_unlock_bh(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Only look at sockets that are using this specific device. */ switch (event) { case NETDEV_CHANGEADDR: case NETDEV_CHANGEMTU: /* A change in mtu or address is a bad thing, requiring * LCP re-negotiation. */ case NETDEV_GOING_DOWN: case NETDEV_DOWN: /* Find every socket on this device and kill it. */ pppoe_flush_dev(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block pppoe_notifier = { .notifier_call = pppoe_device_event, }; /************************************************************************ * * Do the real work of receiving a PPPoE Session frame. * ***********************************************************************/ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pppox_sock *relay_po; /* Backlog receive. Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state * can't change. */ if (skb->pkt_type == PACKET_OTHERHOST) goto abort_kfree; if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); } else if (sk->sk_state & PPPOX_RELAY) { relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (relay_po == NULL) goto abort_kfree; if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) goto abort_put; if (!__pppoe_xmit(sk_pppox(relay_po), skb)) goto abort_put; sock_put(sk_pppox(relay_po)); } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; } return NET_RX_SUCCESS; abort_put: sock_put(sk_pppox(relay_po)); abort_kfree: kfree_skb(skb); return NET_RX_DROP; } /************************************************************************ * * Receive wrapper called in BH context. * ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; int len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb_mac_header_len(skb) < ETH_HLEN) goto drop; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; ph = pppoe_hdr(skb); len = ntohs(ph->length); skb_pull_rcsum(skb, sizeof(*ph)); if (skb->len < len) goto drop; if (pskb_trim_rcsum(skb, len)) goto drop; ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) * is known to be safe. */ po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (!po) goto drop; return sk_receive_skb(sk_pppox(po), skb, 0); drop: kfree_skb(skb); out: return NET_RX_DROP; } static void pppoe_unbind_sock_work(struct work_struct *work) { struct pppox_sock *po = container_of(work, struct pppox_sock, proto.pppoe.padt_work); struct sock *sk = sk_pppox(po); lock_sock(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); release_sock(sk); sock_put(sk); } /************************************************************************ * * Receive a PPPoE Discovery frame. * This is solely for detection of PADT frames * ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb->pkt_type != PACKET_HOST) goto abort; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto abort; ph = pppoe_hdr(skb); if (ph->code != PADT_CODE) goto abort; pn = pppoe_pernet(dev_net(dev)); po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (po) if (!schedule_work(&po->proto.pppoe.padt_work)) sock_put(sk_pppox(po)); abort: kfree_skb(skb); out: return NET_RX_SUCCESS; /* Lies... :-) */ } static struct packet_type pppoes_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_SES), .func = pppoe_rcv, }; static struct packet_type pppoed_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_DISC), .func = pppoe_disc_rcv, }; static struct proto pppoe_sk_proto __read_mostly = { .name = "PPPOE", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; /*********************************************************************** * * Initialize a new struct sock. * **********************************************************************/ static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_OE; INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work, pppoe_unbind_sock_work); return 0; } static int pppoe_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; struct pppoe_net *pn; struct net *net = NULL; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; net = sock_net(sk); pn = pppoe_pernet(net); /* * protect "po" from concurrent updates * on pppoe_flush_dev */ delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = NULL; struct pppoe_net *pn; struct net *net = NULL; int error; lock_sock(sk); error = -EINVAL; if (sockaddr_len != sizeof(struct sockaddr_pppox)) goto end; if (sp->sa_protocol != PX_PROTO_OE) goto end; /* Check for already bound sockets */ error = -EBUSY; if ((sk->sk_state & PPPOX_CONNECTED) && stage_session(sp->sa_addr.pppoe.sid)) goto end; /* Check for already disconnected sockets, on attempts to disconnect */ error = -EALREADY; if ((sk->sk_state & PPPOX_DEAD) && !stage_session(sp->sa_addr.pppoe.sid)) goto end; error = 0; /* Delete the old binding */ if (stage_session(po->pppoe_pa.sid)) { pppox_unbind_sock(sk); pn = pppoe_pernet(sock_net(sk)); delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; sk->sk_state = PPPOX_NONE; } /* Re-bind in session stage only */ if (stage_session(sp->sa_addr.pppoe.sid)) { error = -ENODEV; net = sock_net(sk); dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev); if (!dev) goto err_put; po->pppoe_dev = dev; po->pppoe_ifindex = dev->ifindex; pn = pppoe_pernet(net); if (!(dev->flags & IFF_UP)) { goto err_put; } memcpy(&po->pppoe_pa, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); write_lock_bh(&pn->hash_lock); error = __set_item(pn, po); write_unlock_bh(&pn->hash_lock); if (error < 0) goto err_put; po->chan.hdrlen = (sizeof(struct pppoe_hdr) + dev->hard_header_len); po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2; po->chan.private = sk; po->chan.ops = &pppoe_chan_ops; error = ppp_register_net_channel(dev_net(dev), &po->chan); if (error) { delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); goto err_put; } sk->sk_state = PPPOX_CONNECTED; } po->num = sp->sa_addr.pppoe.sid; end: release_sock(sk); return error; err_put: if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } goto end; } static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OE; memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa, sizeof(struct pppoe_addr)); memcpy(uaddr, &sp, len); return len; } static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int val; int err; switch (cmd) { case PPPIOCGMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (put_user(po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN, (int __user *)arg)) break; err = 0; break; case PPPIOCSMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (get_user(val, (int __user *)arg)) break; if (val < (po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN)) err = 0; else err = -EINVAL; break; case PPPIOCSFLAGS: err = -EFAULT; if (get_user(val, (int __user *)arg)) break; err = 0; break; case PPPOEIOCSFWD: { struct pppox_sock *relay_po; err = -EBUSY; if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) break; err = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; /* PPPoE address from the user specifies an outbound PPPoE address which frames are forwarded to */ err = -EFAULT; if (copy_from_user(&po->pppoe_relay, (void __user *)arg, sizeof(struct sockaddr_pppox))) break; err = -EINVAL; if (po->pppoe_relay.sa_family != AF_PPPOX || po->pppoe_relay.sa_protocol != PX_PROTO_OE) break; /* Check that the socket referenced by the address actually exists. */ relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (!relay_po) break; sock_put(sk_pppox(relay_po)); sk->sk_state |= PPPOX_RELAY; err = 0; break; } case PPPOEIOCDFWD: err = -EALREADY; if (!(sk->sk_state & PPPOX_RELAY)) break; sk->sk_state &= ~PPPOX_RELAY; err = 0; break; default: err = -ENOTTY; } return err; } static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sk_buff *skb; struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int error; struct pppoe_hdr hdr; struct pppoe_hdr *ph; struct net_device *dev; char *start; int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { error = -ENOTCONN; goto end; } hdr.ver = 1; hdr.type = 1; hdr.code = 0; hdr.sid = po->num; dev = po->pppoe_dev; error = -EMSGSIZE; if (total_len > (dev->mtu + dev->hard_header_len)) goto end; hlen = LL_RESERVED_SPACE(dev); skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; skb->priority = sk->sk_priority; skb->protocol = cpu_to_be16(ETH_P_PPP_SES); ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr)); start = (char *)&ph->tag[0]; error = memcpy_from_msg(start, m, total_len); if (error < 0) { kfree_skb(skb); goto end; } error = total_len; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, total_len); memcpy(ph, &hdr, sizeof(struct pppoe_hdr)); ph->length = htons(total_len); dev_queue_xmit(skb); end: release_sock(sk); return error; } /************************************************************************ * * xmit function for internal use. * ***********************************************************************/ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; struct pppoe_hdr *ph; int data_len = skb->len; /* The higher-level PPP code (ppp_unregister_channel()) ensures the PPP * xmit operations conclude prior to an unregistration call. Thus * sk->sk_state cannot change, so we don't need to do lock_sock(). * But, we also can't do a lock_sock since that introduces a potential * deadlock as we'd reverse the lock ordering used when calling * ppp_unregister_channel(). */ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; if (!dev) goto abort; /* Copy the data if there is no space for the header or if it's * read-only. */ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); skb_reset_network_header(skb); ph = pppoe_hdr(skb); ph->ver = 1; ph->type = 1; ph->code = 0; ph->sid = po->num; ph->length = htons(data_len); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); skb->dev = dev; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, data_len); dev_queue_xmit(skb); return 1; abort: kfree_skb(skb); return 1; } /************************************************************************ * * xmit function called by generic PPP driver * sends PPP frame over PPPoE socket * ***********************************************************************/ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = chan->private; return __pppoe_xmit(sk, skb); } static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path, const struct ppp_channel *chan) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED) || !dev) return -1; path->type = DEV_PATH_PPPOE; path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; return 0; } static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, .fill_forward_path = pppoe_fill_forward_path, }; static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; if (sk->sk_state & PPPOX_BOUND) { error = -EIO; goto end; } skb = skb_recv_datagram(sk, flags, &error); if (error < 0) goto end; if (skb) { total_len = min_t(size_t, total_len, skb->len); error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; } } kfree_skb(skb); end: return error; } #ifdef CONFIG_PROC_FS static int pppoe_seq_show(struct seq_file *seq, void *v) { struct pppox_sock *po; char *dev_name; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Id Address Device\n"); goto out; } po = v; dev_name = po->pppoe_pa.dev; seq_printf(seq, "%08X %pM %8s\n", po->pppoe_pa.sid, po->pppoe_pa.remote, dev_name); out: return 0; } static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) { struct pppox_sock *po; int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { po = pn->hash_table[i]; while (po) { if (!pos--) goto out; po = po->next; } } out: return po; } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) __acquires(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; read_lock_bh(&pn->hash_lock); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); struct pppox_sock *po; ++*pos; if (v == SEQ_START_TOKEN) { po = pppoe_get_idx(pn, 0); goto out; } po = v; if (po->next) po = po->next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) break; } } out: return po; } static void pppoe_seq_stop(struct seq_file *seq, void *v) __releases(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); read_unlock_bh(&pn->hash_lock); } static const struct seq_operations pppoe_seq_ops = { .start = pppoe_seq_start, .next = pppoe_seq_next, .stop = pppoe_seq_stop, .show = pppoe_seq_show, }; #endif /* CONFIG_PROC_FS */ static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, .bind = sock_no_bind, .connect = pppoe_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppoe_proto = { .create = pppoe_create, .ioctl = pppoe_ioctl, .owner = THIS_MODULE, }; static __net_init int pppoe_init_net(struct net *net) { struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; rwlock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); #ifdef CONFIG_PROC_FS if (!pde) return -ENOMEM; #endif return 0; } static __net_exit void pppoe_exit_net(struct net *net) { remove_proc_entry("pppoe", net->proc_net); } static struct pernet_operations pppoe_net_ops = { .init = pppoe_init_net, .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), }; static int __init pppoe_init(void) { int err; err = register_pernet_device(&pppoe_net_ops); if (err) goto out; err = proto_register(&pppoe_sk_proto, 0); if (err) goto out_unregister_net_ops; err = register_pppox_proto(PX_PROTO_OE, &pppoe_proto); if (err) goto out_unregister_pppoe_proto; dev_add_pack(&pppoes_ptype); dev_add_pack(&pppoed_ptype); register_netdevice_notifier(&pppoe_notifier); return 0; out_unregister_pppoe_proto: proto_unregister(&pppoe_sk_proto); out_unregister_net_ops: unregister_pernet_device(&pppoe_net_ops); out: return err; } static void __exit pppoe_exit(void) { unregister_netdevice_notifier(&pppoe_notifier); dev_remove_pack(&pppoed_ptype); dev_remove_pack(&pppoes_ptype); unregister_pppox_proto(PX_PROTO_OE); proto_unregister(&pppoe_sk_proto); unregister_pernet_device(&pppoe_net_ops); } module_init(pppoe_init); module_exit(pppoe_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OE);
56 39 39 39 39 39 39 39 92 90 34 710 57 57 57 1 710 31 26 709 27 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 61 62 62 62 61 61 62 61 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 55 55 55 55 55 55 54 55 55 10 10 10 10 10 10 47 67 48 67 68 68 68 68 68 68 68 47 68 48 68 66 67 64 65 65 65 68 28 39 10 30 68 27 27 27 27 29 45 45 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 29 45 69 44 44 44 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 43 42 43 43 42 43 43 12 6 12 11 12 12 12 12 12 12 12 12 12 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge netlink control interface * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/if_bridge.h> #include "br_private.h" #include "br_private_stp.h" #include "br_private_cfm.h" #include "br_private_tunnel.h" #include "br_private_mcast_eht.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int num_vlans = 0; if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; pvid = br_get_pvid(vg); /* Count number of vlan infos */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { if ((vid_range_end - vid_range_start) > 0) num_vlans += 2; else num_vlans += 1; } return num_vlans; } static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { int num_vlans; if (!vg) return 0; if (filter_mask & RTEXT_FILTER_BRVLAN) return vg->num_vlans; rcu_read_lock(); num_vlans = __get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); return num_vlans; } static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { struct net_bridge_vlan_group *vg = NULL; struct net_bridge_port *p = NULL; struct net_bridge *br = NULL; u32 num_cfm_peer_mep_infos; u32 num_cfm_mep_infos; size_t vinfo_sz = 0; int num_vlan_infos; rcu_read_lock(); if (netif_is_bridge_port(dev)) { p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); } else if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); } num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); if (p && (p->flags & BR_VLAN_TUNNEL)) vinfo_sz += br_get_vlan_tunnel_info_size(vg); /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); if (p && vg && (filter_mask & RTEXT_FILTER_MST)) vinfo_sz += br_mst_info_size(vg); if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) return vinfo_sz; if (!br) return vinfo_sz; /* CFM status info must be added */ br_cfm_mep_count(br, &num_cfm_mep_infos); br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos); vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */ /* For each status struct the MEP instance (u32) is added */ /* MEP instance (u32) + br_cfm_mep_status */ vinfo_sz += num_cfm_mep_infos * /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */ + nla_total_size(sizeof(u32))); /* MEP instance (u32) + br_cfm_cc_peer_status */ vinfo_sz += num_cfm_peer_mep_infos * /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */ (nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */ + nla_total_size(sizeof(u8)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */ + nla_total_size(sizeof(u32))); return vinfo_sz; } static inline size_t br_port_info_size(void) { return nla_total_size(1) /* IFLA_BRPORT_STATE */ + nla_total_size(2) /* IFLA_BRPORT_PRIORITY */ + nla_total_size(4) /* IFLA_BRPORT_COST */ + nla_total_size(1) /* IFLA_BRPORT_MODE */ + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_MCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_BCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + nla_total_size(br_get_link_af_size_filtered(dev, filter_mask)) /* IFLA_AF_SPEC */ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */ } static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); struct net_bridge_port *backup_p; u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST, !!(p->flags & BR_MULTICAST_TO_UNICAST)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), &p->designated_root) || nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), &p->designated_bridge) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, !!(p->flags & BR_NEIGH_VLAN_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->forward_delay_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->hold_timer); if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval, IFLA_BRPORT_PAD)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, p->multicast_eht_hosts_cnt) || nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS, br_multicast_ngroups_get(&p->multicast_ctx)) || nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS, br_multicast_ngroups_get_max(&p->multicast_ctx))) return -EMSGSIZE; #endif /* we might be called only with br->lock */ rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, backup_p->dev->ifindex); rcu_read_unlock(); if (p->backup_nhid && nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) return -EMSGSIZE; return 0; } static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start, u16 vid_end, u16 flags) { struct bridge_vlan_info vinfo; if ((vid_end - vid_start) > 0) { /* add range to skb */ vinfo.vid = vid_start; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; vinfo.vid = vid_end; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } else { vinfo.vid = vid_start; vinfo.flags = flags; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *v; u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0; u16 flags, pvid; int err = 0; /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan * and mark vlan info with begin and end flags * if vlaninfo represents a range */ pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; if (!br_vlan_should_use(v)) continue; if (v->vid == pvid) flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_range_start == 0) { goto initvars; } else if ((v->vid - vid_range_end) == 1 && flags == vid_range_flags) { vid_range_end = v->vid; continue; } else { err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } initvars: vid_range_start = v->vid; vid_range_end = v->vid; vid_range_flags = flags; } if (vid_range_start != 0) { /* Call it once more to send any left over vlans */ err = br_fill_ifvlaninfo_range(skb, vid_range_start, vid_range_end, vid_range_flags); if (err) return err; } return 0; } static int br_fill_ifvlaninfo(struct sk_buff *skb, struct net_bridge_vlan_group *vg) { struct bridge_vlan_info vinfo; struct net_bridge_vlan *v; u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; vinfo.vid = v->vid; vinfo.flags = 0; if (v->vid == pvid) vinfo.flags |= BRIDGE_VLAN_INFO_PVID; if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, sizeof(vinfo), &vinfo)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } /* * Create one netlink message for one interface * Contains port and master info as well as carrier and bridge state. */ static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, u32 pid, u32 seq, int event, unsigned int flags, u32 filter_mask, const struct net_device *dev, bool getlink) { u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; struct nlmsghdr *nlh; if (port) br = port->br; else br = netdev_priv(dev); br_debug(br, "br_fill_info event %d port %s master %s\n", event, dev->name, br->dev->name); nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); if (nlh == NULL) return -EMSGSIZE; hdr = nlmsg_data(nlh); hdr->ifi_family = AF_BRIDGE; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; hdr->ifi_index = dev->ifindex; hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; if (event == RTM_NEWLINK && port) { struct nlattr *nest; nest = nla_nest_start(skb, IFLA_PROTINFO); if (nest == NULL || br_port_fill_attrs(skb, port) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } if (filter_mask & (RTEXT_FILTER_BRVLAN | RTEXT_FILTER_BRVLAN_COMPRESSED | RTEXT_FILTER_MRP | RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS | RTEXT_FILTER_MST)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; } /* Check if the VID information is requested */ if ((filter_mask & RTEXT_FILTER_BRVLAN) || (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { struct net_bridge_vlan_group *vg; int err; /* RCU needed because of the VLAN locking rules (rcu || rtnl) */ rcu_read_lock(); if (port) vg = nbp_vlan_group_rcu(port); else vg = br_vlan_group_rcu(br); if (!vg || !vg->num_vlans) { rcu_read_unlock(); goto done; } if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) err = br_fill_ifvlaninfo_compressed(skb, vg); else err = br_fill_ifvlaninfo(skb, vg); if (port && (port->flags & BR_VLAN_TUNNEL)) err = br_fill_vlan_tunnel_info(skb, vg); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_MRP) { int err; if (!br_mrp_enabled(br) || port) goto done; rcu_read_lock(); err = br_mrp_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) { struct nlattr *cfm_nest = NULL; int err; if (!br_cfm_created(br) || port) goto done; cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM); if (!cfm_nest) goto nla_put_failure; if (filter_mask & RTEXT_FILTER_CFM_CONFIG) { rcu_read_lock(); err = br_cfm_config_fill_info(skb, br); rcu_read_unlock(); if (err) goto nla_put_failure; } if (filter_mask & RTEXT_FILTER_CFM_STATUS) { rcu_read_lock(); err = br_cfm_status_fill_info(skb, br, getlink); rcu_read_unlock(); if (err) goto nla_put_failure; } nla_nest_end(skb, cfm_nest); } if ((filter_mask & RTEXT_FILTER_MST) && br_opt_get(br, BROPT_MST_ENABLED) && port) { const struct net_bridge_vlan_group *vg = nbp_vlan_group(port); struct nlattr *mst_nest; int err; if (!vg || !vg->num_vlans) goto done; mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST); if (!mst_nest) goto nla_put_failure; err = br_mst_fill_info(skb, vg); if (err) goto nla_put_failure; nla_nest_end(skb, mst_nest); } done: if (af) { if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0)) nla_nest_end(skb, af); else nla_nest_cancel(skb, af); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void br_info_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port, u32 filter) { struct net_device *dev; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 port_no = 0; if (WARN_ON(!port && !br)) return; if (port) { dev = port->dev; br = port->br; port_no = port->port_no; } else { dev = br->dev; } net = dev_net(dev); br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event); skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC); if (skb == NULL) goto errout; err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_LINK, err); } /* Notify listeners of a change in bridge or port information */ void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; return br_info_notify(event, br, port, filter); } /* * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags) { struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && !(filter_mask & RTEXT_FILTER_MRP) && !(filter_mask & RTEXT_FILTER_CFM_CONFIG) && !(filter_mask & RTEXT_FILTER_CFM_STATUS)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, filter_mask, dev, true); } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo, bool *changed, struct netlink_ext_ack *extack) { bool curr_change; int err = 0; switch (cmd) { case RTM_SETLINK: if (p) { /* if the MASTER flag is set this will act on the global * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, &curr_change, extack); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags, &curr_change, extack); } if (curr_change) *changed = true; break; case RTM_DELLINK: if (p) { if (!nbp_vlan_delete(p, vinfo->vid)) *changed = true; if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) && !br_vlan_delete(p->br, vinfo->vid)) *changed = true; } else if (!br_vlan_delete(br, vinfo->vid)) { *changed = true; } break; } return err; } int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, bool *changed, struct netlink_ext_ack *extack) { int err, rtm_cmd; if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; /* needed for vlan-only NEWVLAN/DELVLAN notifications */ rtm_cmd = br_afspec_cmd_to_rtm(cmd); if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; return 0; } if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; int v, v_change_start = 0; if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { bool curr_change = false; tmp_vinfo.vid = v; err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; if (curr_change) { *changed = curr_change; if (!v_change_start) v_change_start = v; } else { /* nothing to notify yet */ if (!v_change_start) continue; br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); v_change_start = 0; } cond_resched(); } /* v_change_start is set only if the last/whole range changed */ if (v_change_start) br_vlan_notify(br, p, v_change_start, v - 1, rtm_cmd); *vinfo_last = NULL; return err; } err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); if (*changed) br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); return err; } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, int cmd, bool *changed, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; struct nlattr *attr; struct vtunnel_info tinfo_last = {}; struct vtunnel_info tinfo_curr = {}; int err = 0, rem; nla_for_each_nested(attr, af_spec, rem) { err = 0; switch (nla_type(attr)) { case IFLA_BRIDGE_VLAN_TUNNEL_INFO: if (!p || !(p->flags & BR_VLAN_TUNNEL)) return -EINVAL; err = br_parse_vlan_tunnel_info(attr, &tinfo_curr); if (err) return err; err = br_process_vlan_tunnel_info(br, p, cmd, &tinfo_curr, &tinfo_last, changed); if (err) return err; break; case IFLA_BRIDGE_VLAN_INFO: if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, &vinfo_last, changed, extack); if (err) return err; break; case IFLA_BRIDGE_MRP: err = br_mrp_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_CFM: err = br_cfm_parse(br, p, attr, cmd, extack); if (err) return err; break; case IFLA_BRIDGE_MST: if (!p) { NL_SET_ERR_MSG(extack, "MST states can only be set on bridge ports"); return -EINVAL; } if (cmd != RTM_SETLINK) { NL_SET_ERR_MSG(extack, "MST states can only be set through RTM_SETLINK"); return -EINVAL; } err = br_mst_process(p, attr, extack); if (err) return err; break; } } return err; } static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNSPEC] = { .strict_start_type = IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 }, [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ static int br_set_port_state(struct net_bridge_port *p, u8 state) { if (state > BR_STATE_BLOCKING) return -EINVAL; /* if kernel STP is running, don't allow changes */ if (p->br->stp_enabled == BR_KERNEL_STP) return -EBUSY; /* if device is not up, change is not allowed * if link is not present, only allowable state is disabled */ if (!netif_running(p->dev) || (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; br_set_state(p, state); br_port_state_selection(p->br); return 0; } /* Set/clear or port flags based on attribute */ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], int attrtype, unsigned long mask) { if (!tb[attrtype]) return; if (nla_get_u8(tb[attrtype])) p->flags |= mask; else p->flags &= ~mask; } /* Process bridge protocol info on port */ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned long old_flags, changed_mask; bool br_vlan_tunnel_old; int err; old_flags = p->flags; br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false; br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, BR_NEIGH_VLAN_SUPPRESS); if ((p->flags & BR_PORT_MAB) && (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); p->flags = old_flags; return -EINVAL; } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { struct net_bridge_fdb_flush_desc desc = { .flags = BIT(BR_FDB_LOCKED), .flags_mask = BIT(BR_FDB_LOCKED), .port_ifindex = p->dev->ifindex, }; br_fdb_flush(p->br, &desc); } changed_mask = old_flags ^ p->flags; err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack); if (err) { p->flags = old_flags; return err; } if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL)) nbp_vlan_tunnel_info_flush(p); br_port_flags_change(p, changed_mask); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); if (err) return err; } if (tb[IFLA_BRPORT_PRIORITY]) { err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY])); if (err) return err; } if (tb[IFLA_BRPORT_STATE]) { err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE])); if (err) return err; } if (tb[IFLA_BRPORT_FLUSH]) br_fdb_delete_by_port(p->br, p, 0, 0); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); err = br_multicast_set_port_router(&p->multicast_ctx, mcast_router); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) { u32 hlimit; hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]); err = br_multicast_eht_set_hosts_limit(p, hlimit); if (err) return err; } if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) { u32 max_groups; max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]); br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups); } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = fwd_mask; } if (tb[IFLA_BRPORT_BACKUP_PORT]) { struct net_device *backup_dev = NULL; u32 backup_ifindex; backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]); if (backup_ifindex) { backup_dev = __dev_get_by_index(dev_net(p->dev), backup_ifindex); if (!backup_dev) return -ENOENT; } err = nbp_backup_change(p, backup_dev); if (err) return err; } if (tb[IFLA_BRPORT_BACKUP_NHID]) { u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); WRITE_ONCE(p->backup_nhid, backup_nhid); } return 0; } /* Change state and parameters on port. */ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; struct net_bridge_port *p; struct nlattr *protinfo; struct nlattr *afspec; bool changed = false; int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!protinfo && !afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself if the AF_SPEC * is set to see if someone is setting vlan info on the bridge */ if (!p && !afspec) return -EINVAL; if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX, protinfo, br_port_policy, NULL); if (err) return err; spin_lock_bh(&p->br->lock); err = br_setport(p, tb, extack); spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) return -EINVAL; spin_lock_bh(&p->br->lock); err = br_set_port_state(p, nla_get_u8(protinfo)); spin_unlock_bh(&p->br->lock); } if (err) goto out; changed = true; } if (afspec) err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack); if (changed) br_ifinfo_notify(RTM_NEWLINK, br, p); out: return err; } /* Delete port information */ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct net_bridge_port *p; struct nlattr *afspec; bool changed = false; int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) return 0; p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself as well */ if (!p && !netif_is_bridge_master(dev)) return -EINVAL; err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels */ br_ifinfo_notify(RTM_NEWLINK, br, p); return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL] && !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]))) return -EPROTONOSUPPORT; if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); if (defpvid >= VLAN_VID_MASK) return -EINVAL; } #endif return 0; } static int br_port_slave_changelink(struct net_device *brdev, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int ret; if (!data) return 0; spin_lock_bh(&br->lock); ret = br_setport(br_port_get_rtnl(dev), data, extack); spin_unlock_bh(&br->lock); return ret; } static int br_port_fill_slave_info(struct sk_buff *skb, const struct net_device *brdev, const struct net_device *dev) { return br_port_fill_attrs(skb, br_port_get_rtnl(dev)); } static size_t br_port_get_slave_size(const struct net_device *brdev, const struct net_device *dev) { return br_port_info_size(); } static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, [IFLA_BR_MULTI_BOOLOPT] = NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)), }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(brdev); int err; if (!data) return 0; if (data[IFLA_BR_FORWARD_DELAY]) { err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY])); if (err) return err; } if (data[IFLA_BR_HELLO_TIME]) { err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME])); if (err) return err; } if (data[IFLA_BR_MAX_AGE]) { err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE])); if (err) return err; } if (data[IFLA_BR_AGEING_TIME]) { err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); if (err) return err; } if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); err = br_stp_set_enabled(br, stp_enabled, extack); if (err) return err; } if (data[IFLA_BR_PRIORITY]) { u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); br_stp_set_bridge_priority(br, priority); } if (data[IFLA_BR_VLAN_FILTERING]) { u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); err = br_vlan_filter_toggle(br, vlan_filter, extack); if (err) return err; } #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); err = __br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); err = __br_vlan_set_default_pvid(br, defpvid, extack); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_ENABLED]) { __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]); err = br_vlan_set_stats(br, vlan_stats); if (err) return err; } if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); err = br_vlan_set_stats_per_port(br, per_port); if (err) return err; } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); if (fwd_mask & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = fwd_mask; } if (data[IFLA_BR_GROUP_ADDR]) { u8 new_addr[ETH_ALEN]; if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) return -EINVAL; memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; spin_lock_bh(&br->lock); memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); } if (data[IFLA_BR_FDB_FLUSH]) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); err = br_multicast_set_router(&br->multicast_ctx, multicast_router); if (err) return err; } if (data[IFLA_BR_MCAST_SNOOPING]) { u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); err = br_multicast_toggle(br, mcast_snooping, extack); if (err) return err; } if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { u8 val; val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); } if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); err = br_multicast_set_querier(&br->multicast_ctx, mcast_querier); if (err) return err; } if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", RHT_ELASTICITY); if (data[IFLA_BR_MCAST_HASH_MAX]) br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); br_multicast_set_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { __u8 mcast_stats; mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats); } if (data[IFLA_BR_MCAST_IGMP_VERSION]) { __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); err = br_multicast_set_igmp_version(&br->multicast_ctx, igmp_version); if (err) return err; } #if IS_ENABLED(CONFIG_IPV6) if (data[IFLA_BR_MCAST_MLD_VERSION]) { __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); err = br_multicast_set_mld_version(&br->multicast_ctx, mld_version); if (err) return err; } #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); } if (data[IFLA_BR_NF_CALL_IP6TABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); } if (data[IFLA_BR_NF_CALL_ARPTABLES]) { u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); } #endif if (data[IFLA_BR_MULTI_BOOLOPT]) { struct br_boolopt_multi *bm; bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); err = br_boolopt_multi_toggle(br, bm, extack); if (err) return err; } return 0; } static int br_dev_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); int err; err = register_netdevice(dev); if (err) return err; if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } err = br_changelink(dev, tb, data, extack); if (err) br_dev_delete(dev, NULL); return err; } static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->tcn_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->topology_change_timer); if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->gc_work.timer); if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), &br->bridge_id) || nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), &br->designated_root) || nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, br->multicast_ctx.multicast_igmp_version) || br_multicast_dump_querier_state(skb, &br->multicast_ctx, IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) || nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0)) return -EMSGSIZE; #endif return 0; } static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) { struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; int numvls = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; vg = nbp_vlan_group(p); break; default: return 0; } if (vg) { /* we need to count all, even placeholder entries */ list_for_each_entry(v, &vg->vlan_list, vlist) numvls++; } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { struct nlattr *nla __maybe_unused; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge *br; struct nlattr *nest; int vl_idx = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: br = netdev_priv(dev); vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: p = br_port_get_rtnl(dev); if (!p) return 0; br = p->br; vg = nbp_vlan_group(p); break; default: return -EINVAL; } nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; if (vg) { u16 pvid; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; struct pcpu_sw_netstats stats; if (++vl_idx < *prividx) continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; vxi.flags = v->flags; if (v->vid == pvid) vxi.flags |= BRIDGE_VLAN_INFO_PVID; br_vlan_get_stats(v, &stats); vxi.rx_bytes = u64_stats_read(&stats.rx_bytes); vxi.rx_packets = u64_stats_read(&stats.rx_packets); vxi.tx_bytes = u64_stats_read(&stats.tx_bytes); vxi.tx_packets = u64_stats_read(&stats.tx_packets); if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) goto nla_put_failure; } } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (++vl_idx >= *prividx) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, sizeof(struct br_mcast_stats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; br_multicast_get_stats(br, p, nla_data(nla)); } #endif if (p) { nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, sizeof(p->stp_xstats), BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; spin_lock_bh(&br->lock); memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); spin_unlock_bh(&br->lock); } nla_nest_end(skb, nest); *prividx = 0; return 0; nla_put_failure: nla_nest_end(skb, nest); *prividx = vl_idx; return -EMSGSIZE; } static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, }; struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, .maxtype = IFLA_BR_MAX, .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, .changelink = br_changelink, .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, .fill_linkxstats = br_fill_linkxstats, .get_linkxstats_size = br_get_linkxstats_size, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, .slave_changelink = br_port_slave_changelink, .get_slave_size = br_port_get_slave_size, .fill_slave_info = br_port_fill_slave_info, }; int __init br_netlink_init(void) { int err; br_vlan_rtnl_init(); rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); if (err) goto out_af; return 0; out_af: rtnl_af_unregister(&br_af_ops); return err; } void br_netlink_fini(void) { br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); }
833 834 833 890 835 555 556 283 283 834 834 834 890 834 833 833 888 890 890 888 890 890 834 889 888 834 890 887 890 890 1257 1259 1259 65 1257 1113 1260 1259 1257 1144 891 889 890 890 890 889 328 285 285 1258 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else off = get_random_u16(); attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V4_MAXIP]) range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); else range->max_addr.ip = range->min_addr.ip; return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
2 2 2 2 61 224 330 330 329 330 330 330 331 330 330 96 46 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/tcp.h> #include <linux/rcupdate.h> #include <net/tcp.h> void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; struct tcp_fastopen_context *ctxt; rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) { rcu_read_unlock(); return; } rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. * * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); kfree_sensitive(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference_protected( inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); if (ctx) call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); } void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } ctx->key[0].key[0] = get_unaligned_le64(primary_key); ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); if (backup_key) { ctx->key[1].key[0] = get_unaligned_le64(backup_key); ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); ctx->num = 2; } else { ctx->num = 1; } if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); } else { octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); } if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); out: return err; } int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key) { struct tcp_fastopen_context *ctx; int n_keys = 0, i; rcu_read_lock(); if (icsk) ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); else ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { n_keys = tcp_fastopen_context_len(ctx); for (i = 0; i < n_keys; i++) { put_unaligned_le64(ctx->key[i].key[0], key + (i * 2)); put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1); } } rcu_read_unlock(); return n_keys; } static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, const siphash_key_t *key, struct tcp_fastopen_cookie *foc) { BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&iph->saddr, sizeof(iph->saddr) + sizeof(iph->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, sizeof(ip6h->saddr) + sizeof(ip6h->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #endif return false; } /* Generate the fastopen cookie by applying SipHash to both the source and * destination addresses. */ static void tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (ctx) __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); rcu_read_unlock(); } /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. */ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb_dst_drop(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects * skb->len to include the tcp_hdrlen. Hence, it should * be called before __skb_pull(). */ tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->sk_receive_queue, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, * as we certainly are not changing upper 32bit value (0) */ tp->bytes_received = skb->len; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } /* returns 0 - no key match, 1 for primary, 2 for backup */ static int tcp_fastopen_cookie_gen_check(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *orig, struct tcp_fastopen_cookie *valid_foc) { struct tcp_fastopen_cookie search_foc = { .len = -1 }; struct tcp_fastopen_cookie *foc = valid_foc; struct tcp_fastopen_context *ctx; int i, ret = 0; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (!ctx) goto out; for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); if (tcp_fastopen_cookie_match(foc, orig)) { ret = i + 1; goto out; } foc = &search_foc; } out: rcu_read_unlock(); return ret; } static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; bool own_req; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) return NULL; spin_lock(&queue->fastopenq.lock); queue->fastopenq.qlen++; spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ req->timeout = tcp_timeout_init(child); inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, req->timeout, TCP_RTO_MAX); refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ return child; } static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying * to validating the cookie in order to avoid burning CPU cycles * unnecessarily. * * XXX (TFO) - The implication of checking the max_qlen before * processing a cookie request is that clients can't differentiate * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; max_qlen = READ_ONCE(fastopenq->max_qlen); if (max_qlen == 0) return false; if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); return false; } fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); reqsk_put(req1); } return true; } static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { /* Client requests a cookie. */ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); } else if (foc->len > 0) { ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, &valid_foc); if (!ret) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else { /* Cookie is valid. Create a (full) child socket to * accept the data in SYN before returning a SYN-ACK to * ack the data. If we fail to create the socket, fall * back and ack the ISN only but includes the same * cookie. * * Note: Data-less SYN with valid cookie is allowed to * send data in SYN_RECV state. */ fastopen: child = tcp_fastopen_create_child(sk, skb, req); if (child) { if (ret == 2) { valid_foc.exp = foc->exp; *foc = valid_foc; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); } else { foc->len = -1; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); return child; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; } bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { const struct dst_entry *dst; tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { cookie->len = -1; return false; } dst = __sk_dst_get(sk); if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } if (cookie->len > 0) return true; tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; return false; } /* This function checks if we want to defer sending SYN until the first * write(). We defer under the following conditions: * 1. fastopen_connect sockopt is set * 2. we have a valid cookie * Return value: return true if we want to defer until application writes data * return false if we want to send out SYN immediately */ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) { struct tcp_fastopen_cookie cookie = { .len = 0 }; struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { inet_set_bit(DEFER_CONNECT, sk); return true; } /* Alloc fastopen_req in order for FO option to be included * in SYN */ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), sk->sk_allocation); if (tp->fastopen_req) tp->fastopen_req->cookie = cookie; else *err = -ENOBUFS; } return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: * Middlebox firewall issues can potentially cause server's data being * blackholed after a successful 3WHS using TFO. * The proposed solution is to disable active TFO globally under the * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST * 3. client side TFO socket has timed out three times consecutively during * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active * TFO connection with data exchanges. */ /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)) return; /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies); /* Paired with smp_rmb() in tcp_fastopen_active_should_disable(). * We want net->ipv4.tfo_active_disable_stamp to be updated first. */ smp_mb__before_atomic(); atomic_inc(&net->ipv4.tfo_active_disable_times); NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable * Return true if we are still in the active TFO disable period * Return false if timeout already expired and we should use active TFO */ bool tcp_fastopen_active_should_disable(struct sock *sk) { unsigned int tfo_bh_timeout = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout); unsigned long timeout; int tfo_da_times; int multiplier; if (!tfo_bh_timeout) return false; tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); if (!tfo_da_times) return false; /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */ smp_rmb(); /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) + multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, timeout)) return true; /* Mark check bit so we can check for successful active TFO * condition and reset tfo_active_disable_times */ tcp_sk(sk)->syn_fastopen_ch = 1; return false; } /* Disable active TFO if FIN is the only packet in the ofo queue * and no data is received. * Also check if we can reset tfo_active_disable_times if data is * received successfully on a marked active TFO sockets opened on * a non-loopback interface */ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst; struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { skb = skb_rb_first(&tp->out_of_order_queue); if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) { u32 timeouts = inet_csk(sk)->icsk_retransmits; struct tcp_sock *tp = tcp_sk(sk); /* Broken middle-boxes may black-hole Fast Open connection during or * even after the handshake. Be extremely conservative and pause * Fast Open globally after hitting the third consecutive timeout or * exceeding the configured timeout limit. */ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && (timeouts == 2 || (timeouts < 2 && expired))) { tcp_fastopen_active_disable(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } }
3816 495 1890 1536 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 #ifndef _LINUX_HASH_H #define _LINUX_HASH_H /* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ #include <asm/types.h> #include <linux/compiler.h> /* * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and * fs/inode.c. It's not actually prime any more (the previous primes * were actively bad for hashing), but the name remains. */ #if BITS_PER_LONG == 32 #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif BITS_PER_LONG == 64 #define hash_long(val, bits) hash_64(val, bits) #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 #else #error Wordsize not 32 or 64 #endif /* * This hash multiplies the input by a large odd number and takes the * high bits. Since multiplication propagates changes to the most * significant end only, it is essential that the high bits of the * product be used for the hash value. * * Chuck Lever verified the effectiveness of this technique: * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice * properties. (See Knuth vol 3, section 6.4, exercise 9.) * * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2, * which is very slightly easier to multiply by and makes no * difference to the hash distribution. */ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull #ifdef CONFIG_HAVE_ARCH_HASH /* This header may use the GOLDEN_RATIO_xx constants */ #include <asm/hash.h> #endif /* * The _generic versions exist only so lib/test_hash.c can compare * the arch-optimized versions with the generic. * * Note that if you change these, any <asm/hash.h> that aren't updated * to match need to have their HAVE_ARCH_* define values updated so the * self-test will not false-positive. */ #ifndef HAVE_ARCH__HASH_32 #define __hash_32 __hash_32_generic #endif static inline u32 __hash_32_generic(u32 val) { return val * GOLDEN_RATIO_32; } static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); } #ifndef HAVE_ARCH_HASH_64 #define hash_64 hash_64_generic #endif static __always_inline u32 hash_64_generic(u64 val, unsigned int bits) { #if BITS_PER_LONG == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ return val * GOLDEN_RATIO_64 >> (64 - bits); #else /* Hash 64 bits using only 32x32-bit multiply. */ return hash_32((u32)val ^ __hash_32(val >> 32), bits); #endif } static inline u32 hash_ptr(const void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } /* This really should be called fold32_ptr; it does no hashing to speak of. */ static inline u32 hash32_ptr(const void *ptr) { unsigned long val = (unsigned long)ptr; #if BITS_PER_LONG == 64 val ^= (val >> 32); #endif return (u32)val; } #endif /* _LINUX_HASH_H */
77 76 2 2 2 68 33 5 5 5 107 106 5 105 101 91 9 8 171 185 193 75 75 75 199 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 /* License: GPL */ #include <linux/filter.h> #include <linux/mutex.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <linux/module.h> #include <net/sock.h> #include <linux/kernel.h> #include <linux/tcp.h> #include <linux/workqueue.h> #include <linux/nospec.h> #include <linux/cookie.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; DEFINE_COOKIE(sock_cookie); u64 __sock_gen_cookie(struct sock *sk) { u64 res = atomic64_read(&sk->sk_cookie); if (!res) { u64 new = gen_cookie_next(&sock_cookie); atomic64_cmpxchg(&sk->sk_cookie, res, new); /* Another thread might have changed sk_cookie before us. */ res = atomic64_read(&sk->sk_cookie); } return res; } int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) { u64 res; if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE) return 0; res = sock_gen_cookie(sk); if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1]) return -ESTALE; return 0; } EXPORT_SYMBOL_GPL(sock_diag_check_cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie) { u64 res = sock_gen_cookie(sk); cookie[0] = (u32)res; cookie[1] = (u32)(res >> 32); } EXPORT_SYMBOL_GPL(sock_diag_save_cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { u32 mem[SK_MEMINFO_VARS]; sk_get_meminfo(sk, mem); return nla_put(skb, attrtype, sizeof(mem), &mem); } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype) { struct sock_fprog_kern *fprog; struct sk_filter *filter; struct nlattr *attr; unsigned int flen; int err = 0; if (!may_report_filterinfo) { nla_reserve(skb, attrtype, 0); return 0; } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (!filter) goto out; fprog = filter->prog->orig_prog; if (!fprog) goto out; flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); if (attr == NULL) { err = -EMSGSIZE; goto out; } memcpy(nla_data(attr), fprog->filter, flen); out: rcu_read_unlock(); return err; } EXPORT_SYMBOL(sock_diag_put_filterinfo); struct broadcast_sk { struct sock *sk; struct work_struct work; }; static size_t sock_diag_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } static void sock_diag_broadcast_destroy_work(struct work_struct *work) { struct broadcast_sk *bsk = container_of(work, struct broadcast_sk, work); struct sock *sk = bsk->sk; const struct sock_diag_handler *hndl; struct sk_buff *skb; const enum sknetlink_groups group = sock_diag_destroy_group(sk); int err = -1; WARN_ON(group == SKNLGRP_NONE); skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); if (!skb) goto out; mutex_lock(&sock_diag_table_mutex); hndl = sock_diag_handlers[sk->sk_family]; if (hndl && hndl->get_info) err = hndl->get_info(skb, sk); mutex_unlock(&sock_diag_table_mutex); if (!err) nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, GFP_KERNEL); else kfree_skb(skb); out: sk_destruct(sk); kfree(bsk); } void sock_diag_broadcast_destroy(struct sock *sk) { /* Note, this function is often called from an interrupt context. */ struct broadcast_sk *bsk = kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); if (!bsk) return sk_destruct(sk); bsk->sk = sk; INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); queue_work(broadcast_wq, &bsk->work); } void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); inet_rcv_compat = fn; mutex_unlock(&sock_diag_table_mutex); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); inet_rcv_compat = NULL; mutex_unlock(&sock_diag_table_mutex); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { int err = 0; if (hndl->family >= AF_MAX) return -EINVAL; mutex_lock(&sock_diag_table_mutex); if (sock_diag_handlers[hndl->family]) err = -EBUSY; else sock_diag_handlers[hndl->family] = hndl; mutex_unlock(&sock_diag_table_mutex); return err; } EXPORT_SYMBOL_GPL(sock_diag_register); void sock_diag_unregister(const struct sock_diag_handler *hnld) { int family = hnld->family; if (family >= AF_MAX) return; mutex_lock(&sock_diag_table_mutex); BUG_ON(sock_diag_handlers[family] != hnld); sock_diag_handlers[family] = NULL; mutex_unlock(&sock_diag_table_mutex); } EXPORT_SYMBOL_GPL(sock_diag_unregister); static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; struct sock_diag_req *req = nlmsg_data(nlh); const struct sock_diag_handler *hndl; if (nlmsg_len(nlh) < sizeof(*req)) return -EINVAL; if (req->sdiag_family >= AF_MAX) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); if (sock_diag_handlers[req->sdiag_family] == NULL) sock_load_diag_module(req->sdiag_family, 0); mutex_lock(&sock_diag_table_mutex); hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) err = hndl->destroy(skb, nlh); else err = -EOPNOTSUPP; mutex_unlock(&sock_diag_table_mutex); return err; } static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: if (inet_rcv_compat == NULL) sock_load_diag_module(AF_INET, 0); mutex_lock(&sock_diag_table_mutex); if (inet_rcv_compat != NULL) ret = inet_rcv_compat(skb, nlh); else ret = -EOPNOTSUPP; mutex_unlock(&sock_diag_table_mutex); return ret; case SOCK_DIAG_BY_FAMILY: case SOCK_DESTROY: return __sock_diag_cmd(skb, nlh); default: return -EINVAL; } } static DEFINE_MUTEX(sock_diag_mutex); static void sock_diag_rcv(struct sk_buff *skb) { mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); mutex_unlock(&sock_diag_mutex); } static int sock_diag_bind(struct net *net, int group) { switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: if (!sock_diag_handlers[AF_INET]) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) sock_load_diag_module(AF_INET6, 0); break; } return 0; } int sock_diag_destroy(struct sock *sk, int err) { if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!sk->sk_prot->diag_destroy) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, err); } EXPORT_SYMBOL_GPL(sock_diag_destroy); static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .groups = SKNLGRP_MAX, .input = sock_diag_rcv, .bind = sock_diag_bind, .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); return net->diag_nlsk == NULL ? -ENOMEM : 0; } static void __net_exit diag_net_exit(struct net *net) { netlink_kernel_release(net->diag_nlsk); net->diag_nlsk = NULL; } static struct pernet_operations diag_net_ops = { .init = diag_net_init, .exit = diag_net_exit, }; static int __init sock_diag_init(void) { broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } device_initcall(sock_diag_init);
992 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 #undef TRACE_SYSTEM #define TRACE_SYSTEM netlink #if !defined(_TRACE_NETLINK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NETLINK_H #include <linux/tracepoint.h> TRACE_EVENT(netlink_extack, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string( msg, msg ) ), TP_fast_assign( __assign_str(msg, msg); ), TP_printk("msg=%s", __get_str(msg)) ); #endif /* _TRACE_NETLINK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
697 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking event cache. */ #ifndef _NF_CONNTRACK_ECACHE_H #define _NF_CONNTRACK_ECACHE_H #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack_extend.h> enum nf_ct_ecache_state { NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ }; struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ u32 missed; /* missed events */ u32 portid; /* netlink portid of destroyer */ }; static inline struct nf_conntrack_ecache * nf_ct_ecache_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE); #else return NULL; #endif } static inline bool nf_ct_ecache_exist(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE); #else return false; #endif } #ifdef CONFIG_NF_CONNTRACK_EVENTS /* This structure is passed to event handler */ struct nf_ct_event { struct nf_conn *ct; u32 portid; int report; }; struct nf_exp_event { struct nf_conntrack_expect *exp; u32 portid; int report; }; struct nf_ct_event_notifier { int (*ct_event)(unsigned int events, const struct nf_ct_event *item); int (*exp_event)(unsigned int events, const struct nf_exp_event *item); }; void nf_conntrack_register_notifier(struct net *net, const struct nf_ct_event_notifier *nb); void nf_conntrack_unregister_notifier(struct net *net); void nf_ct_deliver_cached_events(struct nf_conn *ct); int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report); bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp); #else static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) { } static inline int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report) { return 0; } static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { return false; } #endif static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; set_bit(event, &e->cache); #endif } static inline int nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, u32 portid, int report) { #ifdef CONFIG_NF_CONNTRACK_EVENTS if (nf_ct_ecache_exist(ct)) return nf_conntrack_eventmask_report(1 << event, ct, portid, report); #endif return 0; } static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS if (nf_ct_ecache_exist(ct)) return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); #endif return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net); static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return net->ct.ecache_dwork_pending; } #else /* CONFIG_NF_CONNTRACK_EVENTS */ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, struct nf_conntrack_expect *exp, u32 portid, int report) { } static inline void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state s) { } static inline void nf_conntrack_ecache_pernet_init(struct net *net) { } static inline void nf_conntrack_ecache_pernet_fini(struct net *net) { } static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ #endif /*_NF_CONNTRACK_ECACHE_H*/
3 435 130 2451 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/kconfig.h> #include <linux/list.h> #include <linux/security.h> #include <linux/umh.h> #include <linux/sysctl.h> #include <linux/module.h> #include "fallback.h" #include "firmware.h" /* * firmware fallback mechanism */ /* * use small loading timeout for caching devices' firmware because all these * firmware images have been loaded successfully at lease once, also system is * ready for completing firmware loading now. The maximum size of firmware in * current distributions is about 2M bytes, so 10 secs should be enough. */ void fw_fallback_set_cache_timeout(void) { fw_fallback_config.old_timeout = __firmware_loading_timeout(); __fw_fallback_set_timeout(10); } /* Restores the timeout to the value last configured during normal operation */ void fw_fallback_set_default_timeout(void) { __fw_fallback_set_timeout(fw_fallback_config.old_timeout); } static long firmware_loading_timeout(void) { return __firmware_loading_timeout() > 0 ? __firmware_loading_timeout() * HZ : MAX_JIFFY_OFFSET; } static inline int fw_sysfs_wait_timeout(struct fw_priv *fw_priv, long timeout) { return __fw_state_wait_common(fw_priv, timeout); } static LIST_HEAD(pending_fw_head); void kill_pending_fw_fallback_reqs(bool only_kill_custom) { struct fw_priv *fw_priv; struct fw_priv *next; mutex_lock(&fw_lock); list_for_each_entry_safe(fw_priv, next, &pending_fw_head, pending_list) { if (!fw_priv->need_uevent || !only_kill_custom) __fw_load_abort(fw_priv); } mutex_unlock(&fw_lock); } /** * fw_load_sysfs_fallback() - load a firmware via the sysfs fallback mechanism * @fw_sysfs: firmware sysfs information for the firmware to load * @timeout: timeout to wait for the load * * In charge of constructing a sysfs fallback interface for firmware loading. **/ static int fw_load_sysfs_fallback(struct fw_sysfs *fw_sysfs, long timeout) { int retval = 0; struct device *f_dev = &fw_sysfs->dev; struct fw_priv *fw_priv = fw_sysfs->fw_priv; /* fall back on userspace loading */ if (!fw_priv->data) fw_priv->is_paged_buf = true; dev_set_uevent_suppress(f_dev, true); retval = device_add(f_dev); if (retval) { dev_err(f_dev, "%s: device_register failed\n", __func__); goto err_put_dev; } mutex_lock(&fw_lock); if (fw_state_is_aborted(fw_priv)) { mutex_unlock(&fw_lock); retval = -EINTR; goto out; } list_add(&fw_priv->pending_list, &pending_fw_head); mutex_unlock(&fw_lock); if (fw_priv->opt_flags & FW_OPT_UEVENT) { fw_priv->need_uevent = true; dev_set_uevent_suppress(f_dev, false); dev_dbg(f_dev, "firmware: requesting %s\n", fw_priv->fw_name); kobject_uevent(&fw_sysfs->dev.kobj, KOBJ_ADD); } else { timeout = MAX_JIFFY_OFFSET; } retval = fw_sysfs_wait_timeout(fw_priv, timeout); if (retval < 0 && retval != -ENOENT) { mutex_lock(&fw_lock); fw_load_abort(fw_sysfs); mutex_unlock(&fw_lock); } if (fw_state_is_aborted(fw_priv)) { if (retval == -ERESTARTSYS) retval = -EINTR; } else if (fw_priv->is_paged_buf && !fw_priv->data) retval = -ENOMEM; out: device_del(f_dev); err_put_dev: put_device(f_dev); return retval; } static int fw_load_from_user_helper(struct firmware *firmware, const char *name, struct device *device, u32 opt_flags) { struct fw_sysfs *fw_sysfs; long timeout; int ret; timeout = firmware_loading_timeout(); if (opt_flags & FW_OPT_NOWAIT) { timeout = usermodehelper_read_lock_wait(timeout); if (!timeout) { dev_dbg(device, "firmware: %s loading timed out\n", name); return -EBUSY; } } else { ret = usermodehelper_read_trylock(); if (WARN_ON(ret)) { dev_err(device, "firmware: %s will not be loaded\n", name); return ret; } } fw_sysfs = fw_create_instance(firmware, name, device, opt_flags); if (IS_ERR(fw_sysfs)) { ret = PTR_ERR(fw_sysfs); goto out_unlock; } fw_sysfs->fw_priv = firmware->priv; ret = fw_load_sysfs_fallback(fw_sysfs, timeout); if (!ret) ret = assign_fw(firmware, device); out_unlock: usermodehelper_read_unlock(); return ret; } static bool fw_force_sysfs_fallback(u32 opt_flags) { if (fw_fallback_config.force_sysfs_fallback) return true; if (!(opt_flags & FW_OPT_USERHELPER)) return false; return true; } static bool fw_run_sysfs_fallback(u32 opt_flags) { int ret; if (fw_fallback_config.ignore_sysfs_fallback) { pr_info_once("Ignoring firmware sysfs fallback due to sysctl knob\n"); return false; } if ((opt_flags & FW_OPT_NOFALLBACK_SYSFS)) return false; /* Also permit LSMs and IMA to fail firmware sysfs fallback */ ret = security_kernel_load_data(LOADING_FIRMWARE, true); if (ret < 0) return false; return fw_force_sysfs_fallback(opt_flags); } /** * firmware_fallback_sysfs() - use the fallback mechanism to find firmware * @fw: pointer to firmware image * @name: name of firmware file to look for * @device: device for which firmware is being loaded * @opt_flags: options to control firmware loading behaviour, as defined by * &enum fw_opt * @ret: return value from direct lookup which triggered the fallback mechanism * * This function is called if direct lookup for the firmware failed, it enables * a fallback mechanism through userspace by exposing a sysfs loading * interface. Userspace is in charge of loading the firmware through the sysfs * loading interface. This sysfs fallback mechanism may be disabled completely * on a system by setting the proc sysctl value ignore_sysfs_fallback to true. * If this is false we check if the internal API caller set the * @FW_OPT_NOFALLBACK_SYSFS flag, if so it would also disable the fallback * mechanism. A system may want to enforce the sysfs fallback mechanism at all * times, it can do this by setting ignore_sysfs_fallback to false and * force_sysfs_fallback to true. * Enabling force_sysfs_fallback is functionally equivalent to build a kernel * with CONFIG_FW_LOADER_USER_HELPER_FALLBACK. **/ int firmware_fallback_sysfs(struct firmware *fw, const char *name, struct device *device, u32 opt_flags, int ret) { if (!fw_run_sysfs_fallback(opt_flags)) return ret; if (!(opt_flags & FW_OPT_NO_WARN)) dev_warn(device, "Falling back to sysfs fallback for: %s\n", name); else dev_dbg(device, "Falling back to sysfs fallback for: %s\n", name); return fw_load_from_user_helper(fw, name, device, opt_flags); }
18 18 69 69 69 3 3 1 1 1 63 62 64 64 31 62 42 42 38 18 18 18 20 20 64 21 21 3 2 2 3 3 3 3 3 1 1 1 1534 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: do_bcast = false; break; case BATADV_FORW_NONE: fallthrough; default: goto dropped; } } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.local_changes, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, };
56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle incoming frames * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE #include <net/netfilter/nf_queue.h> #endif #include <linux/neighbour.h> #include <net/arp.h> #include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" #include "br_private_tunnel.h" static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { br_drop_fake_rtable(skb); return netif_receive_skb(skb); } static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); struct net_bridge_vlan_group *vg; dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); /* Reset the offload_fwd_mark because there could be a stacked * bridge above, and it should not think this bridge it doing * that bridge's work forwarding out its ports. */ br_switchdev_frame_unmark(skb); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. */ if (!(brdev->flags & IFF_PROMISC) && !br_allowed_egress(vg, skb)) { kfree_skb(skb); return NET_RX_DROP; } indev = skb->dev; skb->dev = brdev; skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; if (!p) goto drop; br = p->br; if (br_mst_is_enabled(br)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) goto drop; state = p->state; } brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; if (p->flags & BR_PORT_LOCKED) { struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); if (!fdb_src) { /* FDB miss. Create locked FDB entry if MAB is enabled * and drop the packet. */ if (p->flags & BR_PORT_MAB) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } else if (READ_ONCE(fdb_src->dst) != p || test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { /* FDB mismatch. Drop the packet without roaming. */ goto drop; } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } } nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } if (state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, p, msg); } switch (pkt_type) { case BR_PKT_MULTICAST: mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || br_multicast_is_router(brmctx, skb)) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); break; default: break; } if (dst) { unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb); if (now != dst->used) dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false, vid); else br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) return br_pass_frame_up(skb); out: return 0; drop: kfree_skb(skb); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ if ((p->flags & BR_LEARNING) && nbp_state_should_learn(p) && !br_opt_get(p->br, BROPT_NO_LL_LEARN) && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0); } /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { __br_handle_local_finish(skb); /* return 1 to signal the okfn() was called so it's ok to use the skb */ return 1; } static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries *e = NULL; struct nf_hook_state state; unsigned int verdict, i; struct net *net; int ret; net = dev_net(skb->dev); #ifdef HAVE_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; nf_hook_state_init(&state, NF_BR_PRE_ROUTING, NFPROTO_BRIDGE, skb->dev, NULL, NULL, net, br_handle_frame_finish); for (i = 0; i < e->num_hook_entries; i++) { verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { *pskb = skb; return RX_HANDLER_PASS; } break; case NF_DROP: kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; default: /* STOLEN */ return RX_HANDLER_CONSUMED; } } frame_finish: net = dev_net(skb->dev); br_handle_frame_finish(net, NULL, skb); #else br_handle_frame_finish(dev_net(skb->dev), NULL, skb); #endif return RX_HANDLER_CONSUMED; } /* Return 0 if the frame was not processed otherwise 1 * note: already called with rcu_read_lock */ static int br_process_frame_type(struct net_bridge_port *p, struct sk_buff *skb) { struct br_frame_type *tmp; hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list) if (unlikely(tmp->type == skb->protocol)) return tmp->frame_handler(p, skb); return 0; } /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ goto drop; case 0x0E: /* 802.1AB LLDP */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; *pskb = skb; __br_handle_local_finish(skb); return RX_HANDLER_PASS; default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; if (fwd_mask & (1u << dest[5])) goto forward; } /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) * Thus return 1 from the okfn() to signal the skb is ok to pass */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, br_handle_local_finish) == 1) { return RX_HANDLER_PASS; } else { return RX_HANDLER_CONSUMED; } } if (unlikely(br_process_frame_type(p, skb))) return RX_HANDLER_PASS; forward: if (br_mst_is_enabled(p->br)) goto defer_stp_filtering; switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; return nf_hook_bridge_pre(skb, pskb); default: drop: kfree_skb(skb); } return RX_HANDLER_CONSUMED; } /* This function has no purpose other than to appease the br_port_get_rcu/rtnl * helpers which identify bridged ports according to the rx_handler installed * on them (so there _needs_ to be a bridge rx_handler even if we don't need it * to do anything useful). This bridge won't support traffic to/from the stack, * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal * frames from the ETH_P_XDSA packet_type handler. */ static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) { return RX_HANDLER_PASS; } rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) { if (netdev_uses_dsa(dev)) return br_handle_frame_dummy; return br_handle_frame; } void br_add_frame(struct net_bridge *br, struct br_frame_type *ft) { hlist_add_head_rcu(&ft->list, &br->frame_type_list); } void br_del_frame(struct net_bridge *br, struct br_frame_type *ft) { struct br_frame_type *tmp; hlist_for_each_entry(tmp, &br->frame_type_list, list) if (ft == tmp) { hlist_del_rcu(&ft->list); return; } }
1518 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry **entries; }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(sizeof(*utn), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); if (!utn->entries) goto err_free_utn; for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn->entries); err_free_utn: kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn->entries); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL");
7 5 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file define the new driver API for Wireless Extensions * * Version : 8 16.3.07 * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 2001-2007 Jean Tourrilhes, All Rights Reserved. */ #ifndef _IW_HANDLER_H #define _IW_HANDLER_H /************************** DOCUMENTATION **************************/ /* * Initial driver API (1996 -> onward) : * ----------------------------------- * The initial API just sends the IOCTL request received from user space * to the driver (via the driver ioctl handler). The driver has to * handle all the rest... * * The initial API also defines a specific handler in struct net_device * to handle wireless statistics. * * The initial APIs served us well and has proven a reasonably good design. * However, there is a few shortcommings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). * o Driver has to mess up with copy_to/from_user, and in many cases * does it unproperly. Common mistakes are : * * buffer overflows (no checks or off by one checks) * * call copy_to/from_user with irq disabled * o The user space interface is tied to ioctl because of the use * copy_to/from_user. * * New driver API (2002 -> onward) : * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just export * the list of handler it supports, and those will be called apropriately. * * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). * Hopefully, I've also addressed the shortcomming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions * o Tighter checks of ioctl before calling the driver * o Flexible commit strategy (at least, the start of it) * o Backward compatibility (can be mixed with old API) * o Driver doesn't have to worry about memory and user-space issues * The last point is important for the following reasons : * o You are now able to call the new driver API from any API you * want (including from within other parts of the kernel). * o Common mistakes are avoided (buffer overflow, user space copy * with irq disabled and so on). * * The Drawback of the new API are : * o bloat (especially kernel) * o need to migrate existing drivers to new API * My initial testing shows that the new API adds around 3kB to the kernel * and save between 0 and 5kB from a typical driver. * Also, as all structures and data types are unchanged, the migration is * quite straightforward (but tedious). * * --- * * The new driver API is defined below in this file. User space should * not be aware of what's happening down there... * * A new kernel wrapper is in charge of validating the IOCTLs and calling * the appropriate driver handler. This is implemented in : * # net/core/wireless.c * * The driver export the list of handlers in : * # include/linux/netdevice.h (one place) * * The new driver API is available for WIRELESS_EXT >= 13. * Good luck with migration to the new API ;-) */ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* * Some of the choice I've made are pretty controversials. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * * Implementation goals : * -------------------- * The implementation goals were as follow : * o Obvious : you should not need a PhD to understand what's happening, * the benefit is easier maintenance. * o Flexible : it should accommodate a wide variety of driver * implementations and be as flexible as the old API. * o Lean : it should be efficient memory wise to minimise the impact * on kernel footprint. * o Transparent to user space : the large number of user space * applications that use Wireless Extensions should not need * any modifications. * * Array of functions versus Struct of functions * --------------------------------------------- * 1) Having an array of functions allow the kernel code to access the * handler in a single lookup, which is much more efficient (think hash * table here). * 2) The only drawback is that driver writer may put their handler in * the wrong slot. This is trivial to test (I set the frequency, the * bitrate changes). Once the handler is in the proper slot, it will be * there forever, because the array is only extended at the end. * 3) Backward/forward compatibility : adding new handler just require * extending the array, so you can put newer driver in older kernel * without having to patch the kernel code (and vice versa). * * All handler are of the same generic type * ---------------------------------------- * That's a feature !!! * 1) Having a generic handler allow to have generic code, which is more * efficient. If each of the handler was individually typed I would need * to add a big switch in the kernel (== more bloat). This solution is * more scalable, adding new Wireless Extensions doesn't add new code. * 2) You can use the same handler in different slots of the array. For * hardware, it may be more efficient or logical to handle multiple * Wireless Extensions with a single function, and the API allow you to * do that. (An example would be a single record on the card to control * both bitrate and frequency, the handler would read the old record, * modify it according to info->cmd and rewrite it). * * Functions prototype uses union iwreq_data * ----------------------------------------- * Some would have preferred functions defined this way : * static int mydriver_ioctl_setrate(struct net_device *dev, * long rate, int auto) * 1) The kernel code doesn't "validate" the content of iwreq_data, and * can't do it (different hardware may have different notion of what a * valid frequency is), so we don't pretend that we do it. * 2) The above form is not extendable. If I want to add a flag (for * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). * 4) I don't expect driver developper using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, * struct iw_request_info *info, * struct iw_param *rrq, * char *extra) * And then adding the handler in the array like this : * (iw_handler) mydriver_ioctl_setrate, // SIOCSIWRATE * * Using functions and not a registry * ---------------------------------- * Another implementation option would have been for every instance to * define a registry (a struct containing all the Wireless Extensions) * and only have a function to commit the registry to the hardware. * 1) This approach can be emulated by the current code, but not * vice versa. * 2) Some drivers don't keep any configuration in the driver, for them * adding such a registry would be a significant bloat. * 3) The code to translate from Wireless Extension to native format is * needed anyway, so it would not reduce significantely the amount of code. * 4) The current approach only selectively translate Wireless Extensions * to native format and only selectively set, whereas the registry approach * would require to translate all WE and set all parameters for any single * change. * 5) For many Wireless Extensions, the GET operation return the current * dynamic value, not the value that was set. * * This header is <net/iw_handler.h> * --------------------------------- * 1) This header is kernel space only and should not be exported to * user space. Headers in "include/linux/" are exported, headers in * "include/net/" are not. * * Mixed 32/64 bit issues * ---------------------- * The Wireless Extensions are designed to be 64 bit clean, by using only * datatypes with explicit storage size. * There are some issues related to kernel and user space using different * memory model, and in particular 64bit kernel with 32bit user space. * The problem is related to struct iw_point, that contains a pointer * that *may* need to be translated. * This is quite messy. The new API doesn't solve this problem (it can't), * but is a step in the right direction : * 1) Meta data about each ioctl is easily available, so we know what type * of translation is needed. * 2) The move of data between kernel and user space is only done in a single * place in the kernel, so adding specific hooks in there is possible. * 3) In the long term, it allows to move away from using ioctl as the * user space API. * * So many comments and so few code * -------------------------------- * That's a feature. Comments won't bloat the resulting kernel binary. */ /***************************** INCLUDES *****************************/ #include <linux/wireless.h> /* IOCTL user space API */ #include <linux/if_ether.h> /***************************** VERSION *****************************/ /* * This constant is used to know which version of the driver API is * available. Hopefully, this will be pretty stable and no changes * will be needed... * I just plan to increment with each new version. */ #define IW_HANDLER_VERSION 8 /* * Changes : * * V2 to V3 * -------- * - Move event definition in <linux/wireless.h> * - Add Wireless Event support : * o wireless_send_event() prototype * o iwe_stream_add_event/point() inline functions * V3 to V4 * -------- * - Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes * * V4 to V5 * -------- * - Add new spy support : struct iw_spy_data & prototypes * * V5 to V6 * -------- * - Change the way we get to spy_data method for added safety * - Remove spy #ifdef, they are always on -> cleaner code * - Add IW_DESCR_FLAG_NOMAX flag for very large requests * - Start migrating get_wireless_stats to struct iw_handler_def * * V6 to V7 * -------- * - Add struct ieee80211_device pointer in struct iw_public_data * - Remove (struct iw_point *)->pointer from events and streams * - Remove spy_offset from struct iw_handler_def * - Add "check" version of event macros for ieee802.11 stack * * V7 to V8 * ---------- * - Prevent leaking of kernel space in stream on 64 bits. */ /**************************** CONSTANTS ****************************/ /* Enhanced spy support available */ #define IW_WIRELESS_SPY #define IW_WIRELESS_THRSPY /* Special error message for the driver to indicate that we * should do a commit after return from the iw_handler */ #define EIWCOMMIT EINPROGRESS /* Flags available in struct iw_request_info */ #define IW_REQUEST_FLAG_COMPAT 0x0001 /* Compat ioctl call */ /* Type of headers we know about (basically union iwreq_data) */ #define IW_HEADER_TYPE_NULL 0 /* Not available */ #define IW_HEADER_TYPE_CHAR 2 /* char [IFNAMSIZ] */ #define IW_HEADER_TYPE_UINT 4 /* __u32 */ #define IW_HEADER_TYPE_FREQ 5 /* struct iw_freq */ #define IW_HEADER_TYPE_ADDR 6 /* struct sockaddr */ #define IW_HEADER_TYPE_POINT 8 /* struct iw_point */ #define IW_HEADER_TYPE_PARAM 9 /* struct iw_param */ #define IW_HEADER_TYPE_QUAL 10 /* struct iw_quality */ /* Handling flags */ /* Most are not implemented. I just use them as a reminder of some * cool features we might need one day ;-) */ #define IW_DESCR_FLAG_NONE 0x0000 /* Obvious */ /* Wrapper level flags */ #define IW_DESCR_FLAG_DUMP 0x0001 /* Not part of the dump command */ #define IW_DESCR_FLAG_EVENT 0x0002 /* Generate an event on SET */ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ /* Driver level flags */ #define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ /* ----------------------- WIRELESS HANDLER ----------------------- */ /* * A wireless handler is just a standard function, that looks like the * ioctl handler. * We also define there how a handler list look like... As the Wireless * Extension space is quite dense, we use a simple array, which is faster * (that's the perfect hash table ;-). */ /* * Meta data about the request passed to the iw_handler. * Most handlers can safely ignore what's in there. * The 'cmd' field might come handy if you want to use the same handler * for multiple command... * This struct is also my long term insurance. I can add new fields here * without breaking the prototype of iw_handler... */ struct iw_request_info { __u16 cmd; /* Wireless Extension command */ __u16 flags; /* More to come ;-) */ }; struct net_device; /* * This is how a function handling a Wireless Extension should look * like (both get and set, standard and private). */ typedef int (*iw_handler)(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* * This define all the handler that the driver export. * As you need only one per driver type, please use a static const * shared by all driver instances... Same for the members... * This will be linked from net_device in <linux/netdevice.h> */ struct iw_handler_def { /* Array of handlers for standard ioctls * We will call dev->wireless_handlers->standard[ioctl - SIOCIWFIRST] */ const iw_handler * standard; /* Number of handlers defined (more precisely, index of the * last defined handler + 1) */ __u16 num_standard; #ifdef CONFIG_WEXT_PRIV __u16 num_private; /* Number of private arg description */ __u16 num_private_args; /* Array of handlers for private ioctls * Will call dev->wireless_handlers->private[ioctl - SIOCIWFIRSTPRIV] */ const iw_handler * private; /* Arguments of private handler. This one is just a list, so you * can put it in any order you want and should not leave holes... * We will automatically export that to user space... */ const struct iw_priv_args * private_args; #endif /* New location of get_wireless_stats, to de-bloat struct net_device. * The old pointer in struct net_device will be gradually phased * out, and drivers are encouraged to use this one... */ struct iw_statistics* (*get_wireless_stats)(struct net_device *dev); }; /* ---------------------- IOCTL DESCRIPTION ---------------------- */ /* * One of the main goal of the new interface is to deal entirely with * user space/kernel space memory move. * For that, we need to know : * o if iwreq is a pointer or contain the full data * o what is the size of the data to copy * * For private IOCTLs, we use the same rules as used by iwpriv and * defined in struct iw_priv_args. * * For standard IOCTLs, things are quite different and we need to * use the structures below. Actually, this struct is also more * efficient, but that's another story... */ /* * Describe how a standard IOCTL looks like. */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ __u8 token_type; /* Future */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ /* --------------------- ENHANCED SPY SUPPORT --------------------- */ /* * In the old days, the driver was handling spy support all by itself. * Now, the driver can delegate this task to Wireless Extensions. * It needs to include this struct in its private part and use the * standard spy iw_handler. */ /* * Instance specific spy data, i.e. addresses spied and quality for them. */ struct iw_spy_data { /* --- Standard spy support --- */ int spy_number; u_char spy_address[IW_MAX_SPY][ETH_ALEN]; struct iw_quality spy_stat[IW_MAX_SPY]; /* --- Enhanced spy support (event) */ struct iw_quality spy_thr_low; /* Low threshold */ struct iw_quality spy_thr_high; /* High threshold */ u_char spy_thr_under[IW_MAX_SPY]; }; /* --------------------- DEVICE WIRELESS DATA --------------------- */ /* * This is all the wireless data specific to a device instance that * is managed by the core of Wireless Extensions or the 802.11 layer. * We only keep pointer to those structures, so that a driver is free * to share them between instances. * This structure should be initialised before registering the device. * Access to this data follow the same rules as any other struct net_device * data (i.e. valid as long as struct net_device exist, same locking rules). */ /* Forward declaration */ struct libipw_device; /* The struct */ struct iw_public_data { /* Driver enhanced spy support */ struct iw_spy_data * spy_data; /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ struct libipw_device * libipw; }; /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). * Those may be called by driver modules. */ /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); #ifdef CONFIG_WEXT_CORE /* flush all previous wext events - if work is done from netdev notifiers */ void wireless_nlevent_flush(void); #else static inline void wireless_nlevent_flush(void) {} #endif /* We may need a function to send a stream of events to user space. * More on that later... */ /* Standard handler for SIOCSIWSPY */ int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCGIWSPY */ int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCSIWTHRSPY */ int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCGIWTHRSPY */ int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Driver call to update spy records */ void wireless_spy_update(struct net_device *dev, unsigned char *address, struct iw_quality *wstats); /************************* INLINE FUNTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ static inline int iwe_stream_lcp_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_LCP_LEN; #endif return IW_EV_LCP_LEN; } static inline int iwe_stream_point_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_POINT_LEN; #endif return IW_EV_POINT_LEN; } static inline int iwe_stream_event_len_adjust(struct iw_request_info *info, int event_len) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) { event_len -= IW_EV_LCP_LEN; event_len += IW_EV_COMPAT_LCP_LEN; } #endif return event_len; } /*------------------------------------------------------------------*/ /* * Wrapper to add an Wireless Event to a stream of events. */ char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len); static inline char * iwe_stream_add_event_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { char *res = iwe_stream_add_event(info, stream, ends, iwe, event_len); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add an short Wireless Event containing a pointer to a * stream of events. */ char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra); static inline char * iwe_stream_add_point_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { char *res = iwe_stream_add_point(info, stream, ends, iwe, extra); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add a value to a Wireless Event in a stream of events. * Be careful, this one is tricky to use properly : * At the first run, you need to have (value = event + IW_EV_LCP_LEN). */ char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len); #endif /* _IW_HANDLER_H */
2394 2394 2394 2394 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" /* * Convert from filesystem to in-memory representation. */ static struct posix_acl * ext4_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(ext4_acl_header)) return ERR_PTR(-EINVAL); if (((ext4_acl_header *)value)->a_version != cpu_to_le32(EXT4_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(ext4_acl_header); count = ext4_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation. */ static void * ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) { ext4_acl_header *ext_acl; char *e; size_t n; *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(ext4_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(ext4_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); } if (retval > 0) acl = ext4_acl_from_disk(value, retval); else if (retval == -ENODATA || retval == -ENOSYS) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Set the access or default ACL of an inode. * * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = ext4_xattr_set_handle(handle, inode, name_index, "", value, size, xattr_flags); kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; } /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * * dir->i_rwsem: down * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 // SPDX-License-Identifier: GPL-2.0-only /* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ #include <linux/rpl_iptunnel.h> #include <net/dst_cache.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/ipv6.h> #include <net/rpl.h> struct rpl_iptunnel_encap { DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { struct dst_cache cache; struct rpl_iptunnel_encap tuninfo; }; static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct rpl_lwt *)lwt->data; } static inline struct rpl_iptunnel_encap * rpl_encap_lwtunnel(struct lwtunnel_state *lwt) { return &rpl_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = { [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh, size_t seglen) { int err; if ((srh->hdrlen << 3) != seglen) return false; /* check at least one segment and seglen fit with segments_left */ if (!srh->segments_left || (srh->segments_left * sizeof(struct in6_addr)) != seglen) return false; if (srh->cmpri || srh->cmpre) return false; err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr, srh->segments_left); if (err) return false; if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & IPV6_ADDR_MULTICAST) return false; return true; } static int rpl_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[RPL_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; struct ipv6_rpl_sr_hdr *srh; struct rpl_lwt *rlwt; int err, srh_len; if (family != AF_INET6) return -EINVAL; err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla, rpl_iptunnel_policy, extack); if (err < 0) return err; if (!tb[RPL_IPTUNNEL_SRH]) return -EINVAL; srh = nla_data(tb[RPL_IPTUNNEL_SRH]); srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]); if (srh_len < sizeof(*srh)) return -EINVAL; /* verify that SRH is consistent */ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh))) return -EINVAL; newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt)); if (!newts) return -ENOMEM; rlwt = rpl_lwt_lwtunnel(newts); err = dst_cache_init(&rlwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&rlwt->tuninfo.srh, srh, srh_len); newts->type = LWTUNNEL_ENCAP_RPL; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; *ts = newts; return 0; } static void rpl_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache); } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, const struct ipv6_rpl_sr_hdr *srh) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; struct ipv6hdr *hdr; unsigned char *buf; size_t hdrlen; int err; oldhdr = ipv6_hdr(skb); buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) return -ENOMEM; isrh = (struct ipv6_rpl_sr_hdr *)buf; csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3)); memcpy(isrh, srh, sizeof(*isrh)); memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], (srh->segments_left - 1) * 16); isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr; ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], isrh->segments_left - 1); hdrlen = ((csrh->hdrlen + 1) << 3); err = skb_cow_head(skb, hdrlen + skb->mac_len); if (unlikely(err)) { kfree(buf); return err; } skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, csrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; hdr->daddr = srh->rpl_segaddr[0]; ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); kfree(buf); return 0; } static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; tinfo = rpl_encap_lwtunnel(dst->lwtstate); return rpl_do_srh_inline(skb, rlwt, tinfo->srh); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); err = rpl_do_srh(skb, rlwt); if (unlikely(err)) goto drop; preempt_disable(); dst = dst_cache_get(&rlwt->cache); preempt_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; dst_release(dst); goto drop; } preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); preempt_enable(); } skb_dst_drop(skb); skb_dst_set(skb, dst); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto drop; return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); err = rpl_do_srh(skb, rlwt); if (unlikely(err)) { kfree_skb(skb); return err; } preempt_disable(); dst = dst_cache_get(&rlwt->cache); preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) return err; return dst_input(skb); } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, struct rpl_iptunnel_encap *tuninfo) { struct rpl_iptunnel_encap *data; struct nlattr *nla; int len; len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo->srh, len); return 0; } static int rpl_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh)); } static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a); struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b); int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh); if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops rpl_ops = { .build_state = rpl_build_state, .destroy_state = rpl_destroy_state, .output = rpl_output, .input = rpl_input, .fill_encap = rpl_fill_encap_info, .get_encap_size = rpl_encap_nlsize, .cmp_encap = rpl_encap_cmp, .owner = THIS_MODULE, }; int __init rpl_init(void) { int err; err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); if (err) goto out; pr_info("RPL Segment Routing with IPv6\n"); return 0; out: return err; } void rpl_exit(void) { lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); }
5149 3126 1243 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> #include <uapi/linux/wait.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 #define WQ_FLAG_DONE 0x10 #define WQ_FLAG_PRIORITY 0x20 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisaton of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. */ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. */ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. */ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait(wait) \ do { \ (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */
588 8 4129 1687 1673 543 5 396 629 1692 6016 1371 6 2573 390 39 253 326 385 242 327 325 464 461 4 29 6679 5872 6372 12 12 847 363 352 13 186 72 1019 3 501 1 36 1 5 5 26 26 1536 727 184 1361 297 47 49 9 29 29 44 1 1217 1631 1360 22 4700 3631 22 10 51 154 656 5647 170 1551 5 1 4 1398 3353 3877 1945 164 198 11 260 1500 3146 1906 58 24 291 425 1 1 230 888 1 1255 261 454 1889 407 206 485 287 23 613 342 16 395 265 37 1 16 145 36 157 890 279 260 278 115 2 7 2366 1006 319 1125 2 2 505 481 36 11 127 7 267 235 240 12 210 114 31 9 9 203 4 4 10 278 1 2 903 17 3 4 446 1772 305 155 97 97 67 293 174 677 815 8 115 5 303 22 1 2 2 2 2 2 11 7 3 5 9 9 1 6441 1372 35 6135 3015 15 15 37 3398 22 27 357 579 2830 293 293 168 3261 183 281 17 1319 8 1 25 423 454 741 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS extern int sysctl_max_skb_frags; /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ #define GSO_BY_FRAGS 0xFFFF typedef struct bio_vec skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->bv_len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->bv_len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->bv_len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->bv_len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->bv_page * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate hardware time stamp based on cycles if supported */ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. * * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the * delivery_time in mono clock base (i.e. EDT). Otherwise, the * skb->tstamp has the (rcv) timestamp at ingress and * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; union { struct sock *sk; int ip_defrag_offset; }; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only useable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) #define TC_AT_INGRESS_MASK (1 << 6) #else #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) #define TC_AT_INGRESS_MASK (1 << 1) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_cache; void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. */ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. * * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. */ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) { return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum ip_conntrack_info states * to user states. */ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_TLS_DEVICE return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_TLS_DEVICE to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return &skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->callback(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. * This variant can be used in lockless contexts. */ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. */ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. */ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { /* See skb_queue_empty_lockless() and skb_peek_tail() * for the opposite READ_ONCE() */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *next) { struct sk_buff *first = list->next; struct sk_buff *last = list->prev; WRITE_ONCE(first->prev, prev); WRITE_ONCE(prev->next, first); WRITE_ONCE(last->next, next); WRITE_ONCE(next->prev, last); } /** * skb_queue_splice - join two skb lists, this is designed for stacks * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; } } /** * skb_queue_splice_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * The list at @list is reinitialised */ static inline void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * skb_queue_splice_tail - join two skb lists, each list being a queue * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; } } /** * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * __skb_queue_after - queue a buffer at the list head * @list: list to use * @prev: place after this buffer * @newsk: buffer to queue * * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** * __skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); return len; } static inline unsigned int skb_pagelen(const struct sk_buff *skb) { return skb_headlen(skb) + __skb_pagelen(skb); } static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { frag->bv_page = page; frag->bv_offset = off; skb_frag_size_set(frag, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { skb_frag_t *frag = &shinfo->frags[i]; skb_frag_fill_page_desc(frag, page, off, size); } /** * skb_len_add - adds a number to len fields of skb * @skb: buffer to add len to * @delta: number of bytes to add */ static inline void skb_len_add(struct sk_buff *skb, int delta) { skb->len += delta; skb->data_len += delta; skb->truesize += delta; } /** * __skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Initialises the @i'th fragment of @skb to point to &size bytes at * offset @off within @page. * * Does not take any additional reference on the fragment. */ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. */ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc(skb, i, page, off, size); skb_shinfo(skb)->nr_frags = i + 1; } /** * skb_fill_page_desc_noacc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Variant of skb_fill_page_desc() which does not deal with * pfmemalloc, if page is not owned by us. */ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, struct page *page, int off, int size) { struct skb_shared_info *shinfo = skb_shinfo(skb); __skb_fill_page_desc_noacc(shinfo, i, page, off, size); shinfo->nr_frags = i + 1; } void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->head + skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data - skb->head; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb_reset_tail_pointer(skb); skb->tail += offset; } #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb->tail = skb->data + offset; } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ static inline void skb_assert_len(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET if (WARN_ONCE(!skb->len, "%s\n", __func__)) DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); #endif /* CONFIG_DEBUG_NET */ } /* * Add data to an sk_buff */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); void *skb_put(struct sk_buff *skb, unsigned int len); static inline void *__skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; return tmp; } static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = __skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *__skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = __skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void __skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)__skb_put(skb, 1) = val; } static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)skb_put(skb, 1) = val; } void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; return skb->data; } void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) skb->len += len; pr_err("__skb_pull(len=%u)\n", len); skb_dump(KERN_ERR, skb, false); #endif BUG(); } return skb->data += len; } static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) { return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (unlikely(!__pskb_pull_tail(skb, len - skb_headlen(skb)))) return SKB_DROP_REASON_NOMEM; return SKB_NOT_DROPPED_YET; } static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; skb->len -= len; return skb->data += len; } void skb_condense(struct sk_buff *skb); /** * skb_headroom - bytes at buffer head * @skb: buffer to check * * Return the number of bytes of free space at the head of an &sk_buff. */ static inline unsigned int skb_headroom(const struct sk_buff *skb) { return skb->data - skb->head; } /** * skb_tailroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff */ static inline int skb_tailroom(const struct sk_buff *skb) { return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail; } /** * skb_availroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff * allocated by sk_stream_alloc() */ static inline int skb_availroom(const struct sk_buff *skb) { if (skb_is_nonlinear(skb)) return 0; return skb->end - skb->tail - skb->reserved_tailroom; } /** * skb_reserve - adjust headroom * @skb: buffer to alter * @len: bytes to move * * Increase the headroom of an empty &sk_buff by reducing the tail * room. This is only allowed for an empty buffer. */ static inline void skb_reserve(struct sk_buff *skb, int len) { skb->data += len; skb->tail += len; } /** * skb_tailroom_reserve - adjust reserved_tailroom * @skb: buffer to alter * @mtu: maximum amount of headlen permitted * @needed_tailroom: minimum amount of reserved_tailroom * * Set reserved_tailroom so that headlen can be as large as possible but * not larger than mtu and tailroom cannot be smaller than * needed_tailroom. * The required headroom should already have been reserved before using * this function. */ static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom) { SKB_LINEAR_ASSERT(skb); if (mtu < skb_tailroom(skb) - needed_tailroom) /* use at most mtu */ skb->reserved_tailroom = skb_tailroom(skb) - mtu; else /* use up to all available space */ skb->reserved_tailroom = needed_tailroom; } #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 protocol) { skb->inner_protocol = protocol; skb->inner_protocol_type = ENCAP_TYPE_ETHER; } static inline void skb_set_inner_ipproto(struct sk_buff *skb, __u8 ipproto) { skb->inner_ipproto = ipproto; skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; } static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } static inline void skb_reset_mac_len(struct sk_buff *skb) { skb->mac_len = skb->network_header - skb->mac_header; } static inline unsigned char *skb_inner_transport_header(const struct sk_buff *skb) { return skb->head + skb->inner_transport_header; } static inline int skb_inner_transport_offset(const struct sk_buff *skb) { return skb_inner_transport_header(skb) - skb->data; } static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { skb->inner_transport_header = skb->data - skb->head; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, const int offset) { skb_reset_inner_transport_header(skb); skb->inner_transport_header += offset; } static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + skb->inner_network_header; } static inline void skb_reset_inner_network_header(struct sk_buff *skb) { skb->inner_network_header = skb->data - skb->head; } static inline void skb_set_inner_network_header(struct sk_buff *skb, const int offset) { skb_reset_inner_network_header(skb); skb->inner_network_header += offset; } static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; } static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { skb->inner_mac_header = skb->data - skb->head; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, const int offset) { skb_reset_inner_mac_header(skb); skb->inner_mac_header += offset; } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } static inline void skb_reset_transport_header(struct sk_buff *skb) { skb->transport_header = skb->data - skb->head; } static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { skb_reset_transport_header(skb); skb->transport_header += offset; } static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; } static inline void skb_reset_network_header(struct sk_buff *skb) { skb->network_header = skb->data - skb->head; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) { skb_reset_network_header(skb); skb->network_header += offset; } static inline int skb_mac_header_was_set(const struct sk_buff *skb) { return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->head + skb->mac_header; } static inline int skb_mac_offset(const struct sk_buff *skb) { return skb_mac_header(skb) - skb->data; } static inline u32 skb_mac_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->network_header - skb->mac_header; } static inline void skb_unset_mac_header(struct sk_buff *skb) { skb->mac_header = (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) { skb_reset_mac_header(skb); skb->mac_header += offset; } static inline void skb_pop_mac_header(struct sk_buff *skb) { skb->mac_header = skb->network_header; } static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); } } static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); } static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) { return skb->head + skb->csum_start; } static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; } static inline u32 skb_network_header_len(const struct sk_buff *skb) { return skb->transport_header - skb->network_header; } static inline u32 skb_inner_network_header_len(const struct sk_buff *skb) { return skb->inner_transport_header - skb->inner_network_header; } static inline int skb_network_offset(const struct sk_buff *skb) { return skb_network_header(skb) - skb->data; } static inline int skb_inner_network_offset(const struct sk_buff *skb) { return skb_inner_network_header(skb) - skb->data; } static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull(skb, skb_network_offset(skb) + len); } /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the * hardware handles it or large if we have to take an exception and fix it * in software. * * Since an ethernet header is 14 bytes network drivers often end up with * the IP header at an unaligned offset. The IP header can be aligned by * shifting the start of the packet by 2 bytes. Drivers should do this * with: * * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ #ifndef NET_IP_ALIGN #define NET_IP_ALIGN 2 #endif /* * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive * on some architectures. An architecture can override this value, * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD #define NET_SKB_PAD max(32, L1_CACHE_BYTES) #endif int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (WARN_ON(skb_is_nonlinear(skb))) return; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { __skb_set_length(skb, len); } void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { if (skb->data_len) return ___pskb_trim(skb, len); __skb_trim(skb, len); return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { return (len < skb->len) ? __pskb_trim(skb, len) : 0; } /** * pskb_trim_unique - remove end from a paged unique (not cloned) buffer * @skb: buffer to alter * @len: new length * * This is identical to pskb_trim except that the caller knows that * the skb is not cloned so we should never get an error due to out- * of-memory. */ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) { int err = pskb_trim(skb, len); BUG_ON(err); } static inline int __skb_grow(struct sk_buff *skb, unsigned int len) { unsigned int diff = len - skb->len; if (skb_tailroom(skb) < diff) { int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), GFP_ATOMIC); if (ret) return ret; } __skb_set_length(skb, len); return 0; } /** * skb_orphan - orphan a buffer * @skb: buffer to orphan * * If a buffer currently has an owner then we call the owner's * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ static inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; } else { BUG_ON(skb->sk); } } /** * skb_orphan_frags - orphan the frags contained in a buffer * @skb: buffer to orphan frags from * @gfp_mask: allocation mask for replacement pages * * For each frag in the SKB which needs a destructor (i.e. has an * owner) create a copy of that frag and release the original * page by calling the destructor. */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } /* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } /** * __skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. */ static inline void __skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb_reason(skb, reason); } static inline void __skb_queue_purge(struct sk_buff_head *list) { __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason); static inline void skb_queue_purge(struct sk_buff_head *list) { skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size * * Allocates a frag from a page for receive buffer. * Uses GFP_ATOMIC allocations. */ static inline void *netdev_alloc_frag(unsigned int fragsz) { return __netdev_alloc_frag_align(fragsz, ~0u); } static inline void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __netdev_alloc_frag_align(fragsz, -align); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); /** * netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } /* legacy helper around __netdev_alloc_skb() */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { return __netdev_alloc_skb(NULL, length, gfp_mask); } /* legacy helper around netdev_alloc_skb() */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) { return netdev_alloc_skb(NULL, length); } static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length, gfp_t gfp) { struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); if (NET_IP_ALIGN && skb) skb_reserve(skb, NET_IP_ALIGN); return skb; } static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } static inline void skb_free_frag(void *addr) { page_frag_free(addr); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); static inline void *napi_alloc_frag(unsigned int fragsz) { return __napi_alloc_frag_align(fragsz, ~0u); } static inline void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __napi_alloc_frag_align(fragsz, -align); } struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length) { return __napi_alloc_skb(napi, length, GFP_ATOMIC); } void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * @order: size of the allocation * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. * 1. This is for device Rx, therefor a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } static inline struct page *dev_alloc_pages(unsigned int order) { return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order); } /** * __dev_alloc_page - allocate a page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_page(gfp_t gfp_mask) { return __dev_alloc_pages(gfp_mask, 0); } static inline struct page *dev_alloc_page(void) { return dev_alloc_pages(0); } /** * dev_page_is_reusable - check whether a page can be reused for network Rx * @page: the page to test * * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * * Returns false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) { return likely(page_to_nid(page) == numa_mem_id() && !page_is_pfmemalloc(page)); } /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ static inline void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_frag_off() - Returns the offset of a skb fragment * @frag: the paged fragment */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { return frag->bv_offset; } /** * skb_frag_off_add() - Increments the offset of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { frag->bv_offset += delta; } /** * skb_frag_off_set() - Sets the offset of a skb fragment * @frag: skb fragment * @offset: offset of fragment */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { frag->bv_offset = offset; } /** * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment * @fragto: skb fragment where offset is set * @fragfrom: skb fragment offset is copied from */ static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_offset = fragfrom->bv_offset; } /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Returns the &struct page associated with @frag. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { return frag->bv_page; } /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_page(skb_frag_page(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(struct page *page, bool napi_safe); static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { napi_frag_unref(frag, recycle, false); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) { return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) { void *ptr = page_address(skb_frag_page(frag)); if (unlikely(!ptr)) return NULL; return ptr + skb_frag_off(frag); } /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from */ static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_page = fragfrom->bv_page; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** * skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the * fragment's own offset) * @size: the number of bytes to map * @dir: the direction of the mapping (``PCI_DMA_*``) * * Maps the page associated with @frag to @device. */ static inline dma_addr_t skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); } /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check * @len: length up to which to write * * Returns true if modifying the header part of the cloned buffer * does not requires the data to be copied. */ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len) { return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len; } static inline int skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { int delta = 0; if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, GFP_ATOMIC); return 0; } /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow * @headroom: needed headroom * * If the skb passed lacks sufficient headroom or its data part * is shared, data is reallocated. If reallocation fails, an error * is returned and original skb is not changed. * * The result is skb with writable area skb->head...skb->tail * and at least @headroom of space at head. */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_cloned(skb)); } /** * skb_cow_head - skb_cow but only making the head writable * @skb: buffer to cow * @headroom: needed headroom * * This function is identical to skb_cow except that we replace the * skb_cloned check by skb_header_cloned. It should be used when * you only need to push on some header and do not need to modify * the data. */ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** * skb_padto - pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) return 0; return skb_pad(skb, len - size); } /** * __skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ static inline int __must_check __skb_put_padto(struct sk_buff *skb, unsigned int len, bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } /** * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy, &csum, from)) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } } else if (copy_from_iter_full(skb_put(skb, copy), copy, from)) return 0; __skb_trim(skb, off); return -EFAULT; } static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { if (skb_zcopy(skb)) return false; if (i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && off == skb_frag_off(frag) + skb_frag_size(frag); } return false; } static inline int __skb_linearize(struct sk_buff *skb) { return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; } /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize(struct sk_buff *skb) { return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; } /** * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * * Return true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) { return skb_is_nonlinear(skb) && skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } /** * skb_linearize_cow - make sure skb is linear and writable * @skb: buffer to process * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize_cow(struct sk_buff *skb) { return skb_is_nonlinear(skb) || skb_cloned(skb) ? __skb_linearize(skb) : 0; } static __always_inline void __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_sub(skb->csum, csum_partial(start, len, 0), off); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } /** * skb_postpull_rcsum - update checksum for received skb after pull * @skb: buffer to update * @start: start of data before pull * @len: length of data pulled * * After doing a pull on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = wsum_negate(csum_partial(start, len, wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } static __always_inline void __skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_add(skb->csum, csum_partial(start, len, 0), off); } /** * skb_postpush_rcsum - update checksum for received skb after push * @skb: buffer to update * @start: start of data after push * @len: length of data pushed * * After doing a push on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum. */ static inline void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { __skb_postpush_rcsum(skb, start, len, 0); } void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); /** * skb_push_rcsum - push skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_push on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_push unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) { skb_push(skb, len); skb_postpush_rcsum(skb, skb->data, len); return skb->data; } int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim * @len: new length * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. * It can change skb pointers. */ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; __skb_trim(skb, len); return 0; } static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; return __skb_grow(skb, len); } #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define skb_rb_first(root) rb_to_skb(rb_first(root)) #define skb_rb_last(root) rb_to_skb(rb_last(root)) #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_queue_walk_safe(queue, skb, tmp) \ for (skb = (queue)->next, tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_walk_from(queue, skb) \ for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_rbtree_walk(skb, root) \ for (skb = skb_rb_first(root); skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from(skb) \ for (; skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from_safe(skb, tmp) \ for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ skb = tmp) #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ skb != (struct sk_buff *)(queue); \ skb = skb->prev) #define skb_queue_reverse_walk_safe(queue, skb, tmp) \ for (skb = (queue)->prev, tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) #define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { return skb_shinfo(skb)->frag_list != NULL; } static inline void skb_frag_list_init(struct sk_buff *skb) { skb_shinfo(skb)->frag_list = NULL; } #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len); static inline void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { __skb_free_datagram_locked(sk, skb, 0); } int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_eth_pop(struct sk_buff *skb); int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); }; extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; } static inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); } static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { if (likely(skb_headlen(skb) - offset >= len)) return skb->data + offset; return NULL; } /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. * @skb: socket buffer to check * @features: net device features * * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or * 2. skb is fragmented and the device does not support SG. */ static inline bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG))); } static inline void skb_copy_from_linear_data(const struct sk_buff *skb, void *to, const unsigned int len) { memcpy(to, skb->data, len); } static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, const int offset, void *to, const unsigned int len) { memcpy(to, skb->data + offset, len); } static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) { memcpy(skb->data, from, len); } static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, const int offset, const void *from, const unsigned int len) { memcpy(skb->data + offset, from, len); } void skb_init(void); static inline ktime_t skb_get_ktime(const struct sk_buff *skb) { return skb->tstamp; } /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. */ static inline void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp) { *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_new_timestamp(const struct sk_buff *skb, struct __kernel_sock_timeval *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_usec = ts.tv_nsec / 1000; } static inline void skb_get_timestampns(const struct sk_buff *skb, struct __kernel_old_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void skb_get_new_timestampns(const struct sk_buff *skb, struct __kernel_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); skb->mono_delivery_time = 0; } static inline ktime_t net_timedelta(ktime_t t) { return ktime_sub(ktime_get_real(), t); } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, bool mono) { skb->tstamp = kt; skb->mono_delivery_time = kt && mono; } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->mono_delivery_time) { skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else skb->tstamp = 0; } } static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->mono_delivery_time) return; skb->tstamp = 0; } static inline ktime_t skb_tstamp(const struct sk_buff *skb) { if (skb->mono_delivery_time) return 0; return skb->tstamp; } static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (!skb->mono_delivery_time && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) return ktime_get_real(); return 0; } static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } static inline void *skb_metadata_end(const struct sk_buff *skb) { return skb_mac_header(skb); } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b, u8 meta_len) { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); /* Using more efficient varaiant than plain call to memcmp(). */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) case 32: diffs |= __it_diff(a, b, 64); fallthrough; case 24: diffs |= __it_diff(a, b, 64); fallthrough; case 16: diffs |= __it_diff(a, b, 64); fallthrough; case 8: diffs |= __it_diff(a, b, 64); break; case 28: diffs |= __it_diff(a, b, 64); fallthrough; case 20: diffs |= __it_diff(a, b, 64); fallthrough; case 12: diffs |= __it_diff(a, b, 64); fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; } return diffs; #else return memcmp(a - meta_len, b - meta_len, meta_len); #endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { u8 len_a = skb_metadata_len(skb_a); u8 len_b = skb_metadata_len(skb_b); if (!(len_a | len_b)) return false; return len_a != len_b ? true : __skb_metadata_differs(skb_a, skb_b, len_a); } static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; } static inline void skb_metadata_clear(struct sk_buff *skb) { skb_metadata_set(skb, 0); } struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); bool skb_defer_rx_timestamp(struct sk_buff *skb); #else /* CONFIG_NETWORK_PHY_TIMESTAMPING */ static inline void skb_clone_tx_timestamp(struct sk_buff *skb) { } static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) { return false; } #endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */ /** * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers * must call this function to return the skb back to the stack with a * timestamp. * * @skb: clone of the original outgoing packet * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet * @hwtstamps: hardware time stamps, may be NULL if not available * * If the skb has a socket associated, then this function clones the * skb (thus sharing the actual data and optional structures), stores * the optional hardware time stamping information (if non NULL) or * generates a software time stamp (otherwise), then queues the clone * to the error queue of the socket. Errors are silently ignored. */ void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); /** * skb_tx_timestamp() - Driver hook for transmit timestamping * * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * * Specifically, one should make absolutely sure that this function is * called before TX completion of this packet can trigger. Otherwise * the packet could potentially already be freed. * * @skb: A socket buffer. */ static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) skb_tstamp_tx(skb, NULL); } /** * skb_complete_wifi_ack - deliver skb with wifi status * * @skb: the original outgoing packet * @acked: ack status * */ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) >= 0)); } /** * skb_checksum_complete - Calculate checksum of an entire packet * @skb: packet to process * * This function calculates the checksum over the entire packet plus * the value of skb->csum. The latter can be used to supply the * checksum of a pseudo header as used by TCP/UDP. It returns the * checksum. * * For protocols that contain complete checksums such as ICMP/TCP/UDP, * this function can be used to verify that checksum on received * packets. In that case the function should return zero if the * checksum is correct. In particular, this function will return zero * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the * hardware has already verified the correctness of the checksum. */ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) { return skb_csum_unnecessary(skb) ? 0 : __skb_checksum_complete(skb); } static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level == 0) skb->ip_summed = CHECKSUM_NONE; else skb->csum_level--; } } static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level < SKB_MAX_CSUM_LEVEL) skb->csum_level++; } else if (skb->ip_summed == CHECKSUM_NONE) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = 0; } } static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { skb->ip_summed = CHECKSUM_NONE; skb->csum_level = 0; } } /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; __skb_decr_checksum_unnecessary(skb); return false; } return true; } /* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 /* Unset checksum-complete * * Unset checksum complete can be done when packet is being modified * (uncompressed for instance) and checksum-complete value is * invalidated. */ static inline void skb_checksum_complete_unset(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /* Validate (init) checksum based on checksum complete. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete. In the latter * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo * checksum is stored in skb->csum for use in __skb_checksum_complete * non-zero: value of invalid checksum * */ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, bool complete, __wsum psum) { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { skb->csum_valid = 1; return 0; } } skb->csum = psum; if (complete || skb->len <= CHECKSUM_BREAK) { __sum16 csum; csum = __skb_checksum_complete(skb); skb->csum_valid = !csum; return csum; } return 0; } static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) { return 0; } /* Perform checksum validate (init). Note that this is a macro since we only * want to calculate the pseudo header which is an input function if necessary. * First we try to validate without any computation (checksum unnecessary) and * then calculate based on checksum complete calling the function to compute * pseudo header. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete * non-zero: value of invalid checksum */ #define __skb_checksum_validate(skb, proto, complete, \ zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ __ret; \ }) #define skb_checksum_init(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) #define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) #define skb_checksum_validate(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo) { skb->csum = ~pseudo; skb->ip_summed = CHECKSUM_COMPLETE; } #define skb_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_checksum_convert_check(skb)) \ __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \ } while (0) static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, u16 start, u16 offset) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = ((unsigned char *)ptr + start) - skb->head; skb->csum_offset = offset - start; } /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. */ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, bool nopartial) { __wsum delta; if (!nopartial) { skb_remcsum_adjust_partial(skb, ptr, start, offset); return; } if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } delta = remcsum_adjust(ptr, skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); } static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return (void *)(skb->_nfct & NFCT_PTRMASK); #else return NULL; #endif } static inline unsigned long skb_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return skb->_nfct; #else return 0UL; #endif } static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } #ifdef CONFIG_SKB_EXTENSIONS enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; /** * struct skb_ext - sk_buff extensions * @refcnt: 1 on allocation, deallocated on 0 * @offset: offset to add to @data to obtain extension address * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units * @data: start of extension data, variable sized * * Note: offsets/lengths are stored in chunks of 8 bytes, this allows * to use 'u8' types while allowing up to 2kb worth of extension data. */ struct skb_ext { refcount_t refcnt; u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ u8 chunks; /* same */ char data[] __aligned(8); }; struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); static inline void skb_ext_put(struct sk_buff *skb) { if (skb->active_extensions) __skb_ext_put(skb->extensions); } static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { dst->active_extensions = src->active_extensions; if (src->active_extensions) { struct skb_ext *ext = src->extensions; refcount_inc(&ext->refcnt); dst->extensions = ext; } } static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { skb_ext_put(dst); __skb_ext_copy(dst, src); } static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) { return !!ext->offset[i]; } static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { return skb->active_extensions & (1 << id); } static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) __skb_ext_del(skb, id); } static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) { struct skb_ext *ext = skb->extensions; return (void *)ext + (ext->offset[id] << 3); } return NULL; } static inline void skb_ext_reset(struct sk_buff *skb) { if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions); skb->active_extensions = 0; } } static inline bool skb_has_extensions(struct sk_buff *skb) { return unlikely(skb->active_extensions); } #else static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif } static inline void nf_reset_trace(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } static inline void ipvs_reset(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_VS) skb->ipvs_property = 0; #endif } /* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { to->secmark = from->secmark; } static inline void skb_init_secmark(struct sk_buff *skb) { skb->secmark = 0; } #else static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { } static inline void skb_init_secmark(struct sk_buff *skb) { } #endif static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif } static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); } static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; } static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) { to->queue_mapping = from->queue_mapping; } static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) { skb->queue_mapping = rx_queue + 1; } static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return skb->queue_mapping != 0; } static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) { skb->dst_pending_confirm = val; } static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) { return skb->dst_pending_confirm != 0; } static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif } static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_v6(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_type = 0; } static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, u16 increment) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size += increment; } static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, u16 decrement) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size -= decrement; } void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ const struct skb_shared_info *shinfo = skb_shinfo(skb); if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } return false; } static inline void skb_forward_csum(struct sk_buff *skb) { /* Unfortunately we don't support this one. Any brave souls? */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /** * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE * @skb: skb to check * * fresh skbs have their ip_summed set to CHECKSUM_NONE. * Instead of forcing ip_summed to CHECKSUM_NONE, we can * use this helper, to document places where we make this assertion. */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check * * The head on skbs build around a head frag can be removed if they are * not cloned. This function returns true if the skb head is locked down * due to either being allocated via kmalloc, or by being a clone with * multiple references to the head. */ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. */ static inline __wsum lco_csum(struct sk_buff *skb) { unsigned char *csum_start = skb_checksum_start(skb); unsigned char *l4_hdr = skb_transport_header(skb); __wsum partial; /* Start with complement of inner checksum adjustment */ partial = ~csum_unfold(*(__force __sum16 *)(csum_start + skb->csum_offset)); /* Add in checksum of our headers (incl. outer checksum * adjustment filled in by caller) and return result. */ return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } static inline bool skb_is_redirected(const struct sk_buff *skb) { return skb->redirected; } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb_clear_tstamp(skb); #endif } static inline void skb_reset_redirect(struct sk_buff *skb) { skb->redirected = 0; } static inline void skb_set_redirected_noclear(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; #endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; #else return 0; #endif } static inline void skb_reset_csum_not_inet(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; #if IS_ENABLED(CONFIG_IP_SCTP) skb->csum_not_inet = 0; #endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { #ifdef CONFIG_KCOV skb->kcov_handle = kcov_handle; #endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { #ifdef CONFIG_KCOV return skb->kcov_handle; #else return 0; #endif } static inline void skb_mark_for_recycle(struct sk_buff *skb) { #ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; #endif } ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */
9140 9141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/audit.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /** * tomoyo_print_bprm - Print "struct linux_binprm" for auditing. * * @bprm: Pointer to "struct linux_binprm". * @dump: Pointer to "struct tomoyo_page_dump". * * Returns the contents of @bprm on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_bprm(struct linux_binprm *bprm, struct tomoyo_page_dump *dump) { static const int tomoyo_buffer_len = 4096 * 2; char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS); char *cp; char *last_start; int len; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool truncated = false; if (!buffer) return NULL; len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ "); cp = buffer + len; if (!argv_count) { memmove(cp, "} envp[]={ ", 11); cp += 11; } last_start = cp; while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (offset < PAGE_SIZE) { const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (cp == last_start) *cp++ = '"'; if (cp >= buffer + tomoyo_buffer_len - 32) { /* Reserve some room for "..." string. */ truncated = true; } else if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else if (!c) { *cp++ = '"'; *cp++ = ' '; last_start = cp; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } if (c) continue; if (argv_count) { if (--argv_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } memmove(cp, "} envp[]={ ", 11); cp += 11; last_start = cp; truncated = false; } } else if (envp_count) { if (--envp_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } } } if (!argv_count && !envp_count) break; } offset = 0; } *cp++ = '}'; *cp = '\0'; return buffer; out: snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ... } envp[]= { ... }"); return buffer; } /** * tomoyo_filetype - Get string representation of file type. * * @mode: Mode value for stat(). * * Returns file type string. */ static inline const char *tomoyo_filetype(const umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case 0: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE]; case S_IFDIR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY]; case S_IFLNK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK]; case S_IFIFO: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO]; case S_IFSOCK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET]; case S_IFBLK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV]; case S_IFCHR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV]; } return "unknown"; /* This should not happen. */ } /** * tomoyo_print_header - Get header line of audit log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns string representation. * * This function uses kmalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_header(struct tomoyo_request_info *r) { struct tomoyo_time stamp; const pid_t gpid = task_pid_nr(current); struct tomoyo_obj_info *obj = r->obj; static const int tomoyo_buffer_len = 4096; char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS); int pos; u8 i; if (!buffer) return NULL; tomoyo_convert_time(ktime_get_real_seconds(), &stamp); pos = snprintf(buffer, tomoyo_buffer_len - 1, "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode], str_yes_no(r->granted), gpid, tomoyo_sys_getpid(), tomoyo_sys_getppid(), from_kuid(&init_user_ns, current_uid()), from_kgid(&init_user_ns, current_gid()), from_kuid(&init_user_ns, current_euid()), from_kgid(&init_user_ns, current_egid()), from_kuid(&init_user_ns, current_suid()), from_kgid(&init_user_ns, current_sgid()), from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid())); if (!obj) goto no_obj_info; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct tomoyo_mini_stat *stat; unsigned int dev; umode_t mode; if (!obj->stat_valid[i]) continue; stat = &obj->stat[i]; dev = stat->dev; mode = stat->mode; if (i & 1) { pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, stat->mode & S_IALLUGO); continue; } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, MAJOR(dev), MINOR(dev), mode & S_IALLUGO, tomoyo_filetype(mode)); if (S_ISCHR(mode) || S_ISBLK(mode)) { dev = stat->rdev; pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " dev_major=%u dev_minor=%u", MAJOR(dev), MINOR(dev)); } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " }"); } no_obj_info: if (pos < tomoyo_buffer_len - 1) return buffer; kfree(buffer); return NULL; } /** * tomoyo_init_log - Allocate buffer for audit logs. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns pointer to allocated memory. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf = NULL; char *bprm_info = NULL; const char *header = NULL; char *realpath = NULL; const char *symlink = NULL; int pos; const char *domainname = r->domain->domainname->name; header = tomoyo_print_header(r); if (!header) return NULL; /* +10 is for '\n' etc. and '\0'. */ len += strlen(domainname) + strlen(header) + 10; if (r->ee) { struct file *file = r->ee->bprm->file; realpath = tomoyo_realpath_from_path(&file->f_path); bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump); if (!realpath || !bprm_info) goto out; /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */ len += strlen(realpath) + 80 + strlen(bprm_info); } else if (r->obj && r->obj->symlink_target) { symlink = r->obj->symlink_target->name; /* +18 is for " symlink.target=\"%s\"" */ len += 18 + strlen(symlink); } len = kmalloc_size_roundup(len); buf = kzalloc(len, GFP_NOFS); if (!buf) goto out; len--; pos = snprintf(buf, len, "%s", header); if (realpath) { struct linux_binprm *bprm = r->ee->bprm; pos += snprintf(buf + pos, len - pos, " exec={ realpath=\"%s\" argc=%d envc=%d %s }", realpath, bprm->argc, bprm->envc, bprm_info); } else if (symlink) pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"", symlink); pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname); vsnprintf(buf + pos, len - pos, fmt, args); out: kfree(realpath); kfree(bprm_info); kfree(header); return buf; } /* Wait queue for /sys/kernel/security/tomoyo/audit. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait); /* Structure for audit log. */ struct tomoyo_log { struct list_head list; char *log; int size; }; /* The list for "struct tomoyo_log". */ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); /* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** * tomoyo_get_audit - Get audit mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * @matched_acl: Pointer to "struct tomoyo_acl_info". * @is_granted: True if granted log, false otherwise. * * Returns true if this request should be audited, false otherwise. */ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index, const struct tomoyo_acl_info *matched_acl, const bool is_granted) { u8 mode; const u8 category = tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return false; p = tomoyo_profile(ns, profile); if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG]) return false; if (is_granted && matched_acl && matched_acl->cond && matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO) return matched_acl->cond->grant_log == TOMOYO_GRANTLOG_YES; mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[category]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; if (is_granted) return mode & TOMOYO_CONFIG_WANT_GRANT_LOG; return mode & TOMOYO_CONFIG_WANT_REJECT_LOG; } /** * tomoyo_write_log2 - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns nothing. */ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf; struct tomoyo_log *entry; bool quota_exceeded = false; if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type, r->matched_acl, r->granted)) goto out; buf = tomoyo_init_log(r, len, fmt, args); if (!buf) goto out; entry = kzalloc(sizeof(*entry), GFP_NOFS); if (!entry) { kfree(buf); goto out; } entry->log = buf; len = kmalloc_size_roundup(strlen(buf) + 1); /* * The entry->size is used for memory quota checks. * Don't go beyond strlen(entry->log). */ entry->size = len + kmalloc_size_roundup(sizeof(*entry)); spin_lock(&tomoyo_log_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] && tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >= tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT]) { quota_exceeded = true; } else { tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size; list_add_tail(&entry->list, &tomoyo_log); tomoyo_log_count++; } spin_unlock(&tomoyo_log_lock); if (quota_exceeded) { kfree(buf); kfree(entry); goto out; } wake_up(&tomoyo_log_wait); out: return; } /** * tomoyo_write_log - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int len; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); } /** * tomoyo_read_log - Read an audit log. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ void tomoyo_read_log(struct tomoyo_io_buffer *head) { struct tomoyo_log *ptr = NULL; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_log_lock); if (!list_empty(&tomoyo_log)) { ptr = list_entry(tomoyo_log.next, typeof(*ptr), list); list_del(&ptr->list); tomoyo_log_count--; tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size; } spin_unlock(&tomoyo_log_lock); if (ptr) { head->read_buf = ptr->log; head->r.w[head->r.w_pos++] = head->read_buf; kfree(ptr); } } /** * tomoyo_poll_log - Wait for an audit log. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log. */ __poll_t tomoyo_poll_log(struct file *file, poll_table *wait) { if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_log_wait, wait); if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; return 0; }
1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 // SPDX-License-Identifier: GPL-2.0-only /* * STP SAP demux * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/module.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/stp.h> /* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */ #define GARP_ADDR_MIN 0x20 #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; static DEFINE_MUTEX(stp_proto_mutex); /* Called under rcu_read_lock from LLC */ static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ethhdr *eh = eth_hdr(skb); const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); const struct stp_proto *proto; if (pdu->ssap != LLC_SAP_BSPAN || pdu->dsap != LLC_SAP_BSPAN || pdu->ctrl_1 != LLC_PDU_TYPE_U) goto err; if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) { proto = rcu_dereference(garp_protos[eh->h_dest[5] - GARP_ADDR_MIN]); if (proto && !ether_addr_equal(eh->h_dest, proto->group_address)) goto err; } else proto = rcu_dereference(stp_proto); if (!proto) goto err; proto->rcv(proto, skb, dev); return 0; err: kfree_skb(skb); return 0; } int stp_proto_register(const struct stp_proto *proto) { int err = 0; mutex_lock(&stp_proto_mutex); if (sap_registered++ == 0) { sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); if (!sap) { err = -ENOMEM; goto out; } } if (is_zero_ether_addr(proto->group_address)) rcu_assign_pointer(stp_proto, proto); else rcu_assign_pointer(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], proto); out: mutex_unlock(&stp_proto_mutex); return err; } EXPORT_SYMBOL_GPL(stp_proto_register); void stp_proto_unregister(const struct stp_proto *proto) { mutex_lock(&stp_proto_mutex); if (is_zero_ether_addr(proto->group_address)) RCU_INIT_POINTER(stp_proto, NULL); else RCU_INIT_POINTER(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], NULL); synchronize_rcu(); if (--sap_registered == 0) llc_sap_put(sap); mutex_unlock(&stp_proto_mutex); } EXPORT_SYMBOL_GPL(stp_proto_unregister); MODULE_LICENSE("GPL");
104 515 515 514 515 515 515 514 514 501 501 110 110 110 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * This file contains sctp stream maniuplation primitives and helpers. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) { struct sctp_association *asoc; struct sctp_chunk *ch, *temp; struct sctp_outq *outq; asoc = container_of(stream, struct sctp_association, stream); outq = &asoc->outqueue; list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (sid < outcnt) continue; sctp_sched_dequeue_common(outq, ch); /* No need to call dequeue_done here because * the chunks are not scheduled by now. */ /* Mark as failed send. */ sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(ch); } } static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; sched = sctp_sched_ops_from_stream(stream); sched->free_sid(stream, sid); kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. */ static void sctp_stream_outq_migrate(struct sctp_stream *stream, struct sctp_stream *new, __u16 outcnt) { int i; if (stream->outcnt > outcnt) sctp_stream_shrink_out(stream, outcnt); if (new) { /* Here we actually move the old ext stuff into the new * buffer, because we want to keep it. Then * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { int ret; if (outcnt <= stream->outcnt) goto out; ret = genradix_prealloc(&stream->out, outcnt, gfp); if (ret) return ret; out: stream->outcnt = outcnt; return 0; } static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { int ret; if (incnt <= stream->incnt) goto out; ret = genradix_prealloc(&stream->in, incnt, gfp); if (ret) return ret; out: stream->incnt = incnt; return 0; } int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; handle_in: sctp_stream_interleave_init(stream); if (!incnt) return 0; return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); if (ret) { kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } return ret; } void sctp_stream_free(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } void sctp_stream_clear(struct sctp_stream *stream) { int i; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; stream->in = new->in; stream->outcnt = new->outcnt; stream->incnt = new->incnt; sched->sched_all(stream); new->out.tree.root = NULL; new->in.tree.root = NULL; new->outcnt = 0; new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); return retval; } static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, __u16 str_nums, __be16 *str_list) { struct sctp_association *asoc; __u16 i; asoc = container_of(stream, struct sctp_association, stream); if (!asoc->outqueue.out_qlen) return true; if (!str_nums) return false; for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); if (SCTP_SO(stream, sid)->ext && !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } return true; } int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; if (!out && !in) goto out; str_nums = params->srs_number_streams; str_list = params->srs_stream_list; if (str_nums) { int param_len = 0; if (out) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->outcnt) goto out; param_len = str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_outreq); } if (in) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->incnt) goto out; param_len += str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_inreq); } if (param_len > SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_reconf_chunk)) goto out; } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { retval = -ENOMEM; goto out; } for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { kfree(nstr_list); retval = -EAGAIN; goto out; } chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); if (!chunk) { retval = -ENOMEM; goto out; } if (out) { if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; if (!out) goto out; if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } asoc->strreset_outstanding = out + in; out: return retval; } int sctp_send_reset_assoc(struct sctp_association *asoc) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) return -ENOPROTOOPT; if (asoc->strreset_outstanding) return -EINPROGRESS; if (!sctp_outq_is_empty(&asoc->outqueue)) return -EAGAIN; chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } asoc->strreset_outstanding = 1; return 0; } int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u32 outcnt, incnt; __u16 out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->sas_outstrms; in = params->sas_instrms; outcnt = stream->outcnt + out; incnt = stream->incnt + in; if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || (!out && !in)) { retval = -EINVAL; goto out; } if (out) { retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); if (retval) goto out; } chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) { retval = -ENOMEM; goto out; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; goto out; } asoc->strreset_outstanding = !!out + !!in; out: return retval; } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { /* sctp_strreset_tsnreq is actually the basic structure * of all stream reconf params, so it's safe to use it * to access request_seq. */ struct sctp_strreset_tsnreq *req = param.v; if ((!resp_seq || req->request_seq == resp_seq) && (!type || type == req->param_hdr.type)) return param.v; } return NULL; } static void sctp_update_strreset_result(struct sctp_association *asoc, __u32 result) { asoc->strreset_result[1] = asoc->strreset_result[0]; asoc->strreset_result[0] = result; } struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __be16 *str_p = NULL; __u32 request_seq; __u16 i, nums; request_seq = ntohl(outreq->request_seq); if (ntohl(outreq->send_reset_at_tsn) > sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with * result denied, as well as to keep consistent with bsd. */ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->incnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq; __u16 i, nums; __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { result = SCTP_STRRESET_IN_PROGRESS; asoc->strreset_inseq--; goto err; } chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; if (nums) for (i = 0; i < nums; i++) SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; request_seq = ntohl(tsnreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } if (!sctp_outq_is_empty(&asoc->outqueue)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } /* G4: The same processing as though a FWD-TSN chunk (as defined in * [RFC3758]) with all streams affected and a new cumulative TSN * ACK of the Receiver's Next TSN minus 1 were received MUST be * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); /* G3: The same processing as though a SACK chunk with no gap report * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); /* G2: Compute an appropriate value for the local endpoint's next TSN, * i.e., the next TSN assigned by the receiver of the SSN/TSN reset * chunk. The value SHOULD be the highest TSN sent by the receiver * of the request plus 1. */ next_tsn = asoc->next_tsn; asoc->ctsn_ack_point = next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, next_tsn, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_tsnresp(asoc, result, request_seq, next_tsn, init_tsn); } struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq, incnt; __u16 in, i; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; in = ntohs(addstrm->number_of_streams); incnt = stream->incnt + in; if (!in || incnt > SCTP_MAX_STREAM) goto out; if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_change_event(asoc, 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } out = ntohs(addstrm->number_of_streams); outcnt = stream->outcnt + out; if (!out || outcnt > SCTP_MAX_STREAM) goto out; ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); if (ret) goto out; chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); stream->outcnt = outcnt; result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; struct sctp_transport *t; __u16 i, nums, flags = 0; struct sctp_paramhdr *req; __u32 result; req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); if (!req) return NULL; result = ntohl(resp->result); if (result != SCTP_STRRESET_PERFORMED) { /* if in progress, do nothing but retransmit */ if (result == SCTP_STRRESET_IN_PROGRESS) return NULL; else if (result == SCTP_STRRESET_DENIED) flags = SCTP_STREAM_RESET_DENIED; else flags = SCTP_STREAM_RESET_FAILED; } if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { sout = SCTP_SO(stream, ntohs(str_p[i])); sout->mid = 0; sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { sout = SCTP_SO(stream, i); sout->mid = 0; sout->mid_uo = 0; } } } flags |= SCTP_STREAM_RESET_OUTGOING_SSN; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) return NULL; inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); flags |= SCTP_STREAM_RESET_INCOMING_SSN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { struct sctp_strreset_resptsn *resptsn; __u32 stsn, rtsn; /* check for resptsn, as sctp_verify_reconf didn't do it*/ if (ntohs(param.p->length) != sizeof(*resptsn)) return NULL; resptsn = (struct sctp_strreset_resptsn *)resp; stsn = ntohl(resptsn->senders_next_tsn); rtsn = ntohl(resptsn->receivers_next_tsn); if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); LIST_HEAD(temp); asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); /* Clean up sacked and abandoned queues only. As the * out_chunk_list may not be empty, splice it to temp, * then get it back after sctp_outq_free is done. */ list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { struct sctp_strreset_addstrm *addstrm; __u16 number; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); number = stream->outcnt - nums; if (result == SCTP_STRRESET_PERFORMED) { for (i = number; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; } else { sctp_stream_shrink_out(stream, number); stream->outcnt = number; } *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, 0, nums, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { struct sctp_strreset_addstrm *addstrm; /* if the result is performed, it's impossible for addstrm in * request. */ if (result == SCTP_STRRESET_PERFORMED) return NULL; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, nums, 0, GFP_ATOMIC); } asoc->strreset_outstanding--; asoc->strreset_outseq++; /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } return NULL; }
2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: GPL-2.0 /* Watch queue and general notification mechanism, built on pipes * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/poll.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/sched/signal.h> #include <linux/watch_queue.h> #include <linux/pipe_fs_i.h> MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) /* * This must be called under the RCU read-lock, which makes * sure that the wqueue still exists. It can then take the lock, * and check that the wqueue hasn't been destroyed, which in * turn makes sure that the notification pipe still exists. */ static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } return true; } static inline void unlock_wqueue(struct watch_queue *wqueue) { spin_unlock_bh(&wqueue->lock); } static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct watch_queue *wqueue = (struct watch_queue *)buf->private; struct page *page; unsigned int bit; /* We need to work out which note within the page this refers to, but * the note might have been maximum size, so merely ANDing the offset * off doesn't work. OTOH, the note must've been more than zero size. */ bit = buf->offset + buf->len; if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) bit -= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; bit += page->index; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing #define watch_queue_pipe_buf_try_steal NULL /* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { .release = watch_queue_pipe_buf_release, .try_steal = watch_queue_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* * Post a notification to a watch queue. * * Must be called with the RCU lock for reading, and the * watch_queue lock held, which guarantees that the pipe * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) { void *p; struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; unsigned int head, tail, mask, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) goto lost; note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; p = kmap_atomic(page); memcpy(p + offset, n, len); kunmap_atomic(p); buf = &pipe->bufs[head & mask]; buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); BUG(); } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; out: spin_unlock_irq(&pipe->rd_wait.lock); if (done) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); return done; lost: buf = &pipe->bufs[(head - 1) & mask]; buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } /* * Apply filter rules to a notification. */ static bool filter_watch_notification(const struct watch_filter *wf, const struct watch_notification *n) { const struct watch_type_filter *wt; unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; unsigned int st_index = n->subtype / st_bits; unsigned int st_bit = 1U << (n->subtype % st_bits); int i; if (!test_bit(n->type, wf->type_filter)) return false; for (i = 0; i < wf->nr_filters; i++) { wt = &wf->filters[i]; if (n->type == wt->type && (wt->subtype_filter[st_index] & st_bit) && (n->info & wt->info_mask) == wt->info_filter) return true; } return false; /* If there is a filter, the default is to reject. */ } /** * __post_watch_notification - Post an event notification * @wlist: The watch list to post the event to. * @n: The notification record to post. * @cred: The creds of the process that triggered the notification. * @id: The ID to match on the watch. * * Post a notification of an event into a set of watch queues and let the users * know. * * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and * should be in units of sizeof(*n). */ void __post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { const struct watch_filter *wf; struct watch_queue *wqueue; struct watch *watch; if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { WARN_ON(1); return; } rcu_read_lock(); hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { if (watch->id != id) continue; n->info &= ~WATCH_INFO_ID; n->info |= watch->info_id; wqueue = rcu_dereference(watch->queue); wf = rcu_dereference(wqueue->filter); if (wf && !filter_watch_notification(wf, n)) continue; if (security_post_notification(watch->cred, cred, n) < 0) continue; if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); unlock_wqueue(wqueue); } } rcu_read_unlock(); } EXPORT_SYMBOL(__post_watch_notification); /* * Allocate sufficient pages to preallocation for the requested number of * notifications. */ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) { struct watch_queue *wqueue = pipe->watch_queue; struct page **pages; unsigned long *bitmap; unsigned long user_bufs; int ret, i, nr_pages; if (!wqueue) return -ENODEV; if (wqueue->notes) return -EBUSY; if (nr_notes < 1 || nr_notes > 512) /* TODO: choose a better hard limit */ return -EINVAL; nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); if (nr_pages > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto error; } nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; ret = -ENOMEM; pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); if (!pages) goto error; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; wqueue->nr_notes = nr_notes; return 0; error_p: while (--i >= 0) __free_page(pages[i]); kfree(pages); error: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); return ret; } /* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, struct watch_notification_filter __user *_filter) { struct watch_notification_type_filter *tf; struct watch_notification_filter filter; struct watch_type_filter *q; struct watch_filter *wfilter; struct watch_queue *wqueue = pipe->watch_queue; int ret, nr_filter = 0, i; if (!wqueue) return -ENODEV; if (!_filter) { /* Remove the old filter */ wfilter = NULL; goto set; } /* Grab the user's filter specification */ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) return -EFAULT; if (filter.nr_filters == 0 || filter.nr_filters > 16 || filter.__reserved != 0) return -EINVAL; tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); ret = -EINVAL; for (i = 0; i < filter.nr_filters; i++) { if ((tf[i].info_filter & ~tf[i].info_mask) || tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } /* Now we need to build the internal filter from only the relevant * user-specified filters. */ ret = -ENOMEM; wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); if (!wfilter) goto err_filter; wfilter->nr_filters = nr_filter; q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; q->info_filter = tf[i].info_filter; q->info_mask = tf[i].info_mask; q->subtype_filter[0] = tf[i].subtype_filter[0]; __set_bit(q->type, wfilter->type_filter); q++; } kfree(tf); set: pipe_lock(pipe); wfilter = rcu_replace_pointer(wqueue->filter, wfilter, lockdep_is_held(&pipe->mutex)); pipe_unlock(pipe); if (wfilter) kfree_rcu(wfilter, rcu); return 0; err_filter: kfree(tf); return ret; } static void __put_watch_queue(struct kref *kref) { struct watch_queue *wqueue = container_of(kref, struct watch_queue, usage); struct watch_filter *wfilter; int i; for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) kfree_rcu(wfilter, rcu); kfree_rcu(wqueue, rcu); } /** * put_watch_queue - Dispose of a ref on a watchqueue. * @wqueue: The watch queue to unref. */ void put_watch_queue(struct watch_queue *wqueue) { kref_put(&wqueue->usage, __put_watch_queue); } EXPORT_SYMBOL(put_watch_queue); static void free_watch(struct rcu_head *rcu) { struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); kfree(watch); } static void __put_watch(struct kref *kref) { struct watch *watch = container_of(kref, struct watch, usage); call_rcu(&watch->rcu, free_watch); } /* * Discard a watch. */ static void put_watch(struct watch *watch) { kref_put(&watch->usage, __put_watch); } /** * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * * Initialise a watch and set the watch queue. */ void init_watch(struct watch *watch, struct watch_queue *wqueue) { kref_init(&watch->usage); INIT_HLIST_NODE(&watch->list_node); INIT_HLIST_NODE(&watch->queue_node); rcu_assign_pointer(watch->queue, wqueue); } static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) { const struct cred *cred; struct watch *w; hlist_for_each_entry(w, &wlist->watchers, list_node) { struct watch_queue *wq = rcu_access_pointer(w->queue); if (wqueue == wq && watch->id == w->id) return -EBUSY; } cred = current_cred(); if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { atomic_dec(&cred->user->nr_watches); return -EAGAIN; } watch->cred = get_cred(cred); rcu_assign_pointer(watch->watch_list, wlist); kref_get(&wqueue->usage); kref_get(&watch->usage); hlist_add_head(&watch->queue_node, &wqueue->watches); hlist_add_head_rcu(&watch->list_node, &wlist->watchers); return 0; } /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add * @wlist: The watch list to add to * * @watch->queue must have been set to point to the queue to post notifications * to and the watch list of the object to be watched. @watch->cred must also * have been set to the appropriate credentials and a ref taken on them. * * The caller must pin the queue and the list both and must hold the list * locked against racing watch additions/removals. */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { struct watch_queue *wqueue; int ret = -ENOENT; rcu_read_lock(); wqueue = rcu_access_pointer(watch->queue); if (lock_wqueue(wqueue)) { spin_lock(&wlist->lock); ret = add_one_watch(watch, wlist, wqueue); spin_unlock(&wlist->lock); unlock_wqueue(wqueue); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(add_watch_to_object); /** * remove_watch_from_object - Remove a watch or all watches from an object. * @wlist: The watch list to remove from * @wq: The watch queue of interest (ignored if @all is true) * @id: The ID of the watch to remove (ignored if @all is true) * @all: True to remove all objects * * Remove a specific watch or all watches from an object. A notification is * sent to the watcher to tell them that this happened. */ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, u64 id, bool all) { struct watch_notification_removal n; struct watch_queue *wqueue; struct watch *watch; int ret = -EBADSLT; rcu_read_lock(); again: spin_lock(&wlist->lock); hlist_for_each_entry(watch, &wlist->watchers, list_node) { if (all || (watch->id == id && rcu_access_pointer(watch->queue) == wq)) goto found; } spin_unlock(&wlist->lock); goto out; found: ret = 0; hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); spin_unlock(&wlist->lock); /* We now own the reference on watch that used to belong to wlist. */ n.watch.type = WATCH_TYPE_META; n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; n.watch.info = watch->info_id | watch_sizeof(n.watch); n.id = id; if (id != 0) n.watch.info = watch->info_id | watch_sizeof(n); wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } unlock_wqueue(wqueue); } if (wlist->release_watch) { void (*release_watch)(struct watch *); release_watch = wlist->release_watch; rcu_read_unlock(); (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); if (all && !hlist_empty(&wlist->watchers)) goto again; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(remove_watch_from_object); /* * Remove all the watches that are contributory to a queue. This has the * potential to race with removal of the watches by the destruction of the * objects being watched or with the distribution of notifications. */ void watch_queue_clear(struct watch_queue *wqueue) { struct watch_list *wlist; struct watch *watch; bool release; rcu_read_lock(); spin_lock_bh(&wqueue->lock); /* * This pipe can be freed by callers like free_pipe_info(). * Removing this reference also prevents new notifications. */ wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); hlist_del_init_rcu(&watch->queue_node); /* We now own a ref on the watch. */ spin_unlock_bh(&wqueue->lock); /* We can't do the next bit under the queue lock as we need to * get the list lock - which would cause a deadlock if someone * was removing from the opposite direction at the same time or * posting a notification. */ wlist = rcu_dereference(watch->watch_list); if (wlist) { void (*release_watch)(struct watch *); spin_lock(&wlist->lock); release = !hlist_unhashed(&watch->list_node); if (release) { hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); /* We now own a second ref on the watch. */ } release_watch = wlist->release_watch; spin_unlock(&wlist->lock); if (release) { if (release_watch) { rcu_read_unlock(); /* This might need to call dput(), so * we have to drop all the locks. */ (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); } } put_watch(watch); spin_lock_bh(&wqueue->lock); } spin_unlock_bh(&wqueue->lock); rcu_read_unlock(); } /** * get_watch_queue - Get a watch queue from its file descriptor. * @fd: The fd to query. */ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); struct fd f; f = fdget(fd); if (f.file) { pipe = get_pipe_info(f.file, false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } fdput(f); } return wqueue; } EXPORT_SYMBOL(get_watch_queue); /* * Initialise a watch queue */ int watch_queue_init(struct pipe_inode_info *pipe) { struct watch_queue *wqueue; wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); if (!wqueue) return -ENOMEM; wqueue->pipe = pipe; kref_init(&wqueue->usage); spin_lock_init(&wqueue->lock); INIT_HLIST_HEAD(&wqueue->watches); pipe->watch_queue = wqueue; return 0; }
28 28 28 28 28 28 22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct gro_cell *cell; int res; rcu_read_lock(); if (unlikely(!(dev->flags & IFF_UP))) goto drop; if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { res = netif_rx(skb); goto unlock; } cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); res = NET_RX_SUCCESS; unlock: rcu_read_unlock(); return res; } EXPORT_SYMBOL(gro_cells_receive); /* called under BH context */ static int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; while (work_done < budget) { skb = __skb_dequeue(&cell->napi_skbs); if (!skb) break; napi_gro_receive(napi, skb); work_done++; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) { int i; gcells->cells = alloc_percpu(struct gro_cell); if (!gcells->cells) return -ENOMEM; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); struct percpu_free_defer { struct rcu_head rcu; void __percpu *ptr; }; static void percpu_free_defer_callback(struct rcu_head *head) { struct percpu_free_defer *defer; defer = container_of(head, struct percpu_free_defer, rcu); free_percpu(defer->ptr); kfree(defer); } void gro_cells_destroy(struct gro_cells *gcells) { struct percpu_free_defer *defer; int i; if (!gcells->cells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } /* We need to observe an rcu grace period before freeing ->cells, * because netpoll could access dev->napi_list under rcu protection. * Try hard using call_rcu() instead of synchronize_rcu(), * because we might be called from cleanup_net(), and we * definitely do not want to block this critical task. */ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); if (likely(defer)) { defer->ptr = gcells->cells; call_rcu(&defer->rcu, percpu_free_defer_callback); } else { /* We do not hold RTNL at this point, synchronize_net() * would not be able to expedite this sync. */ synchronize_rcu_expedited(); free_percpu(gcells->cells); } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy);
1179 638 638 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction attribute tables * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" /* Attribute search APIs */ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) { return inat_primary_table[opcode]; } int inat_get_last_prefix_id(insn_byte_t last_pfx) { insn_attr_t lpfx_attr; lpfx_attr = inat_get_opcode_attribute(last_pfx); return inat_last_prefix_id(lpfx_attr); } insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; int n; n = inat_escape_id(esc_attr); table = inat_escape_tables[n][0]; if (!table) return 0; if (inat_has_variant(table[opcode]) && lpfx_id) { table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; int n; n = inat_group_id(grp_attr); table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } return table[X86_MODRM_REG(modrm)] | inat_group_common_attribute(grp_attr); } insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_p) { const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; /* At first, this checks the master table */ table = inat_avx_tables[vex_m][0]; if (!table) return 0; if (!inat_is_group(table[opcode]) && vex_p) { /* If this is not a group, get attribute directly */ table = inat_avx_tables[vex_m][vex_p]; if (!table) return 0; } return table[opcode]; }
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2022 Intel Corporation */ #include <linux/debugfs.h> #include <linux/ieee80211.h> #include "ieee80211_i.h" #include "debugfs.h" #include "debugfs_sta.h" #include "sta_info.h" #include "driver-ops.h" /* sta attributtes */ #define STA_READ(name, field, format_string) \ static ssize_t sta_ ##name## _read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct sta_info *sta = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, sta->field); \ } #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } #define STA_OPS_RW(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .write = sta_##name##_write, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } #define STA_FILE(name, field, format) \ STA_READ_##format(name, field) \ STA_OPS(name) STA_FILE(aid, sta.aid, D); static const char * const sta_flag_names[] = { #define FLAG(F) [WLAN_STA_##F] = #F FLAG(AUTH), FLAG(ASSOC), FLAG(PS_STA), FLAG(AUTHORIZED), FLAG(SHORT_PREAMBLE), FLAG(WDS), FLAG(CLEAR_PS_FILT), FLAG(MFP), FLAG(BLOCK_BA), FLAG(PS_DRIVER), FLAG(PSPOLL), FLAG(TDLS_PEER), FLAG(TDLS_PEER_AUTH), FLAG(TDLS_INITIATOR), FLAG(TDLS_CHAN_SWITCH), FLAG(TDLS_OFF_CHANNEL), FLAG(TDLS_WIDER_BW), FLAG(UAPSD), FLAG(SP), FLAG(4ADDR_EVENT), FLAG(INSERTED), FLAG(RATE_CONTROL), FLAG(TOFFSET_KNOWN), FLAG(MPSP_OWNER), FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), FLAG(USES_ENCRYPTION), FLAG(DECAP_OFFLOAD), #undef FLAG }; static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[16 * NUM_WLAN_STA_FLAGS], *pos = buf; char *end = buf + sizeof(buf) - 1; struct sta_info *sta = file->private_data; unsigned int flg; BUILD_BUG_ON(ARRAY_SIZE(sta_flag_names) != NUM_WLAN_STA_FLAGS); for (flg = 0; flg < NUM_WLAN_STA_FLAGS; flg++) { if (test_sta_flag(sta, flg)) pos += scnprintf(pos, end - pos, "%s\n", sta_flag_names[flg]); } return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } STA_OPS(flags); static ssize_t sta_num_ps_buf_frames_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; char buf[17*IEEE80211_NUM_ACS], *p = buf; int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) p += scnprintf(p, sizeof(buf)+buf-p, "AC%d: %d\n", ac, skb_queue_len(&sta->ps_tx_buf[ac]) + skb_queue_len(&sta->tx_filtered[ac])); return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); } STA_OPS(num_ps_buf_frames); static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15*IEEE80211_NUM_TIDS], *p = buf; int i; struct sta_info *sta = file->private_data; for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%x ", le16_to_cpu(sta->last_seq_ctrl[i])); p += scnprintf(p, sizeof(buf)+buf-p, "\n"); return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); } STA_OPS(last_seq_ctrl); #define AQM_TXQ_ENTRY_LEN 130 static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->local; size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2); char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; struct txq_info *txqi; ssize_t rv; int i; if (!buf) return -ENOMEM; spin_lock_bh(&local->fq.lock); rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, "target %uus interval %uus ecn %s\n", codel_time_to_us(sta->cparams.target), codel_time_to_us(sta->cparams.interval), sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz + buf - p, "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, txqi->tin.backlog_bytes, txqi->tin.backlog_packets, txqi->tin.flows, txqi->cstats.drop_count, txqi->cstats.ecn_mark, txqi->tin.overlimit, txqi->tin.collisions, txqi->tin.tx_bytes, txqi->tin.tx_packets, txqi->flags, test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ? "STOP" : "RUN", test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags) ? " AMPDU" : "", test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags) ? " NO-AMSDU" : "", test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : ""); } rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } STA_OPS(aqm); static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s32 deficit[IEEE80211_NUM_ACS]; ssize_t rv; int ac; if (!buf) return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; deficit[ac] = sta->airtime[ac].deficit; spin_unlock_bh(&local->active_txq_lock[ac]); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" "Deficit: VO: %d us VI: %d us BE: %d us BK: %d us\n", rx_airtime, tx_airtime, sta->airtime_weight, deficit[0], deficit[1], deficit[2], deficit[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].rx_airtime = 0; sta->airtime[ac].tx_airtime = 0; sta->airtime[ac].deficit = sta->airtime_weight; spin_unlock_bh(&local->active_txq_lock[ac]); } return count; } STA_OPS_RW(airtime); static ssize_t sta_aql_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u32 q_depth[IEEE80211_NUM_ACS]; u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; ssize_t rv; int ac; if (!buf) return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); q_limit_l[ac] = sta->airtime[ac].aql_limit_low; q_limit_h[ac] = sta->airtime[ac].aql_limit_high; spin_unlock_bh(&local->active_txq_lock[ac]); q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } p += scnprintf(p, bufsz + buf - p, "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", q_depth[0], q_depth[1], q_depth[2], q_depth[3], q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } static ssize_t sta_aql_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; u32 ac, q_limit_l, q_limit_h; char _buf[100] = {}, *buf = _buf; if (count > sizeof(_buf)) return -EINVAL; if (copy_from_user(buf, userbuf, count)) return -EFAULT; buf[sizeof(_buf) - 1] = '\0'; if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h) != 3) return -EINVAL; if (ac >= IEEE80211_NUM_ACS) return -EINVAL; sta->airtime[ac].aql_limit_low = q_limit_l; sta->airtime[ac].aql_limit_high = q_limit_h; return count; } STA_OPS_RW(aql); static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; ssize_t bufsz = 71 + IEEE80211_NUM_TIDS * 40; int i; struct sta_info *sta = file->private_data; struct tid_ampdu_rx *tid_rx; struct tid_ampdu_tx *tid_tx; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, "next dialog_token: %#02x\n", sta->ampdu_mlme.dialog_token_allocator + 1); p += scnprintf(p, bufsz + buf - p, "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { bool tid_rx_valid; tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]); tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]); tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid); p += scnprintf(p, bufsz + buf - p, "%02d", i); p += scnprintf(p, bufsz + buf - p, "\t\t%x", tid_rx_valid); p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_rx_valid ? sta->ampdu_mlme.tid_rx_token[i] : 0); p += scnprintf(p, bufsz + buf - p, "\t%#.3x", tid_rx ? tid_rx->ssn : 0); p += scnprintf(p, bufsz + buf - p, "\t\t%x", !!tid_tx); p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_tx ? tid_tx->dialog_token : 0); p += scnprintf(p, bufsz + buf - p, "\t%03d", tid_tx ? skb_queue_len(&tid_tx->pending) : 0); p += scnprintf(p, bufsz + buf - p, "\n"); } rcu_read_unlock(); ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { char _buf[25] = {}, *buf = _buf; struct sta_info *sta = file->private_data; bool start, tx; unsigned long tid; char *pos; int ret, timeout = 5000; if (count > sizeof(_buf)) return -EINVAL; if (copy_from_user(buf, userbuf, count)) return -EFAULT; buf[sizeof(_buf) - 1] = '\0'; pos = buf; buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (!strcmp(buf, "tx")) tx = true; else if (!strcmp(buf, "rx")) tx = false; else return -EINVAL; buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (!strcmp(buf, "start")) { start = true; if (!tx) return -EINVAL; } else if (!strcmp(buf, "stop")) { start = false; } else { return -EINVAL; } buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (sscanf(buf, "timeout=%d", &timeout) == 1) { buf = strsep(&pos, " "); if (!buf || !tx || !start) return -EINVAL; } ret = kstrtoul(buf, 0, &tid); if (ret || tid >= IEEE80211_NUM_TIDS) return -EINVAL; if (tx) { if (start) ret = ieee80211_start_tx_ba_session(&sta->sta, tid, timeout); else ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); } else { __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3, true); ret = 0; } return ret ?: count; } STA_OPS_RW(agg_status); /* link sta attributes */ #define LINK_STA_OPS(name) \ static const struct file_operations link_sta_ ##name## _ops = { \ .read = link_sta_##name##_read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct link_sta_info *link_sta = file->private_data; u8 mac[3 * ETH_ALEN + 1]; snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr); return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN); } LINK_STA_OPS(addr); static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { #define PRINT_HT_CAP(_cond, _str) \ do { \ if (_cond) \ p += scnprintf(p, bufsz + buf - p, "\t" _str "\n"); \ } while (0) char *buf, *p; int i; ssize_t bufsz = 512; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsz + buf - p, "ht %ssupported\n", htc->ht_supported ? "" : "not "); if (htc->ht_supported) { p += scnprintf(p, bufsz + buf - p, "cap: %#.4x\n", htc->cap); PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC"); PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40"); PRINT_HT_CAP(!(htc->cap & BIT(1)), "HT20"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 0, "Static SM Power Save"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 1, "Dynamic SM Power Save"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 3, "SM Power Save disabled"); PRINT_HT_CAP((htc->cap & BIT(4)), "RX Greenfield"); PRINT_HT_CAP((htc->cap & BIT(5)), "RX HT20 SGI"); PRINT_HT_CAP((htc->cap & BIT(6)), "RX HT40 SGI"); PRINT_HT_CAP((htc->cap & BIT(7)), "TX STBC"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 0, "No RX STBC"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 1, "RX STBC 1-stream"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 2, "RX STBC 2-streams"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 3, "RX STBC 3-streams"); PRINT_HT_CAP((htc->cap & BIT(10)), "HT Delayed Block Ack"); PRINT_HT_CAP(!(htc->cap & BIT(11)), "Max AMSDU length: " "3839 bytes"); PRINT_HT_CAP((htc->cap & BIT(11)), "Max AMSDU length: " "7935 bytes"); /* * For beacons and probe response this would mean the BSS * does or does not allow the usage of DSSS/CCK HT40. * Otherwise it means the STA does or does not use * DSSS/CCK HT40. */ PRINT_HT_CAP((htc->cap & BIT(12)), "DSSS/CCK HT40"); PRINT_HT_CAP(!(htc->cap & BIT(12)), "No DSSS/CCK HT40"); /* BIT(13) is reserved */ PRINT_HT_CAP((htc->cap & BIT(14)), "40 MHz Intolerant"); PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection"); p += scnprintf(p, bufsz + buf - p, "ampdu factor/density: %d/%d\n", htc->ampdu_factor, htc->ampdu_density); p += scnprintf(p, bufsz + buf - p, "MCS mask:"); for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) p += scnprintf(p, bufsz + buf - p, " %.2x", htc->mcs.rx_mask[i]); p += scnprintf(p, bufsz + buf - p, "\n"); /* If not set this is meaningless */ if (le16_to_cpu(htc->mcs.rx_highest)) { p += scnprintf(p, bufsz + buf - p, "MCS rx highest: %d Mbps\n", le16_to_cpu(htc->mcs.rx_highest)); } p += scnprintf(p, bufsz + buf - p, "MCS tx params: %x\n", htc->mcs.tx_params); } ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(ht_capa); static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap; ssize_t ret; ssize_t bufsz = 512; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsz + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? "" : "not "); if (vhtc->vht_supported) { p += scnprintf(p, bufsz + buf - p, "cap: %#.8x\n", vhtc->cap); #define PFLAG(a, b) \ do { \ if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \ p += scnprintf(p, bufsz + buf - p, \ "\t\t%s\n", b); \ } while (0) switch (vhtc->cap & 0x3) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-3895\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-7991\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-11454\n"); break; default: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-UNKNOWN\n"); } switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case 0: p += scnprintf(p, bufsz + buf - p, "\t\t80Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: p += scnprintf(p, bufsz + buf - p, "\t\t160Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: p += scnprintf(p, bufsz + buf - p, "\t\t80+80Mhz\n"); break; default: p += scnprintf(p, bufsz + buf - p, "\t\tUNKNOWN-MHZ: 0x%x\n", (vhtc->cap >> 2) & 0x3); } PFLAG(RXLDPC, "RXLDPC"); PFLAG(SHORT_GI_80, "SHORT-GI-80"); PFLAG(SHORT_GI_160, "SHORT-GI-160"); PFLAG(TXSTBC, "TXSTBC"); p += scnprintf(p, bufsz + buf - p, "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7); PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE"); PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE"); p += scnprintf(p, bufsz + buf - p, "\t\tBEAMFORMEE-STS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >> IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT); p += scnprintf(p, bufsz + buf - p, "\t\tSOUNDING-DIMENSIONS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK) >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT); PFLAG(MU_BEAMFORMER_CAPABLE, "MU-BEAMFORMER-CAPABLE"); PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE"); PFLAG(VHT_TXOP_PS, "TXOP-PS"); PFLAG(HTC_VHT, "HTC-VHT"); p += scnprintf(p, bufsz + buf - p, "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >> IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT); PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB, "LINK-ADAPTATION-VHT-UNSOL-MFB"); p += scnprintf(p, bufsz + buf - p, "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26); PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN"); PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN"); p += scnprintf(p, bufsz + buf - p, "RX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); if (vhtc->vht_mcs.rx_highest) p += scnprintf(p, bufsz + buf - p, "MCS RX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.rx_highest)); p += scnprintf(p, bufsz + buf - p, "TX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.tx_mcs_map)); if (vhtc->vht_mcs.tx_highest) p += scnprintf(p, bufsz + buf - p, "MCS TX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.tx_highest)); #undef PFLAG } ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(vht_capa); static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; int i; ssize_t ret; buf = kmalloc(buf_sz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, buf_sz + buf - p, "HE %ssupported\n", hec->has_he ? "" : "not "); if (!hec->has_he) goto out; cap = hec->he_cap_elem.mac_cap_info; p += scnprintf(p, buf_sz + buf - p, "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]); #define PRINT(fmt, ...) \ p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \ ##__VA_ARGS__) #define PFLAG(t, n, a, b) \ do { \ if (cap[n] & IEEE80211_HE_##t##_CAP##n##_##a) \ PRINT("%s", b); \ } while (0) #define PFLAG_RANGE(t, i, n, s, m, off, fmt) \ do { \ u8 msk = IEEE80211_HE_##t##_CAP##i##_##n##_MASK; \ u8 idx = ((cap[i] & msk) >> (ffs(msk) - 1)) + off; \ PRINT(fmt, (s << idx) + (m * idx)); \ } while (0) #define PFLAG_RANGE_DEFAULT(t, i, n, s, m, off, fmt, a, b) \ do { \ if (cap[i] == IEEE80211_HE_##t ##_CAP##i##_##n##_##a) { \ PRINT("%s", b); \ break; \ } \ PFLAG_RANGE(t, i, n, s, m, off, fmt); \ } while (0) PFLAG(MAC, 0, HTC_HE, "HTC-HE"); PFLAG(MAC, 0, TWT_REQ, "TWT-REQ"); PFLAG(MAC, 0, TWT_RES, "TWT-RES"); PFLAG_RANGE_DEFAULT(MAC, 0, DYNAMIC_FRAG, 0, 1, 0, "DYNAMIC-FRAG-LEVEL-%d", NOT_SUPP, "NOT-SUPP"); PFLAG_RANGE_DEFAULT(MAC, 0, MAX_NUM_FRAG_MSDU, 1, 0, 0, "MAX-NUM-FRAG-MSDU-%d", UNLIMITED, "UNLIMITED"); PFLAG_RANGE_DEFAULT(MAC, 1, MIN_FRAG_SIZE, 128, 0, -1, "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED"); PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0, "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN"); PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1, "MULTI-TID-AGG-RX-QOS-%d"); if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) { switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) { case 0: PRINT("LINK-ADAPTATION-NO-FEEDBACK"); break; case 1: PRINT("LINK-ADAPTATION-RESERVED"); break; case 2: PRINT("LINK-ADAPTATION-UNSOLICITED-FEEDBACK"); break; case 3: PRINT("LINK-ADAPTATION-BOTH"); break; } } PFLAG(MAC, 2, ALL_ACK, "ALL-ACK"); PFLAG(MAC, 2, TRS, "TRS"); PFLAG(MAC, 2, BSR, "BSR"); PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT"); PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP"); PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING"); PFLAG(MAC, 2, ACK_EN, "ACK-EN"); PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL"); PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA"); switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) { case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0: PRINT("MAX-AMPDU-LEN-EXP-USE-EXT-0"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-1"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-2"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-3"); break; } PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG"); PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED"); PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS"); PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG"); PFLAG(MAC, 4, QTP, "QTP"); PFLAG(MAC, 4, BQR, "BQR"); PFLAG(MAC, 4, PSR_RESP, "PSR-RESP"); PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); PFLAG(MAC, 4, OPS, "OPS"); PFLAG(MAC, 4, AMSDU_IN_AMPDU, "AMSDU-IN-AMPDU"); PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); PFLAG(MAC, 5, SUBCHAN_SELECTIVE_TRANSMISSION, "SUBCHAN-SELECTIVE-TRANSMISSION"); PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU"); PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX"); PFLAG(MAC, 5, HE_DYNAMIC_SM_PS, "HE-DYNAMIC-SM-PS"); PFLAG(MAC, 5, PUNCTURED_SOUNDING, "PUNCTURED-SOUNDING"); PFLAG(MAC, 5, HT_VHT_TRIG_FRAME_RX, "HT-VHT-TRIG-FRAME-RX"); cap = hec->he_cap_elem.phy_cap_info; p += scnprintf(p, buf_sz + buf - p, "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6], cap[7], cap[8], cap[9], cap[10]); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G, "CHANNEL-WIDTH-SET-40MHZ-IN-2G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G, "CHANNEL-WIDTH-SET-40MHZ-80MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_160MHZ_IN_5G, "CHANNEL-WIDTH-SET-160MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, "CHANNEL-WIDTH-SET-80PLUS80-MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G, "CHANNEL-WIDTH-SET-RU-MAPPING-IN-2G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G, "CHANNEL-WIDTH-SET-RU-MAPPING-IN-5G"); switch (cap[1] & IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK) { case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ: PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-20MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ: PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-40MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ: PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-20MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ: PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-40MHZ"); break; } PFLAG(PHY, 1, DEVICE_CLASS_A, "IEEE80211-HE-PHY-CAP1-DEVICE-CLASS-A"); PFLAG(PHY, 1, LDPC_CODING_IN_PAYLOAD, "LDPC-CODING-IN-PAYLOAD"); PFLAG(PHY, 1, HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US, "HY-CAP1-HE-LTF-AND-GI-FOR-HE-PPDUS-0-8US"); PRINT("MIDAMBLE-RX-MAX-NSTS-%d", ((cap[2] << 1) | (cap[1] >> 7)) & 0x3); PFLAG(PHY, 2, NDP_4x_LTF_AND_3_2US, "NDP-4X-LTF-AND-3-2US"); PFLAG(PHY, 2, STBC_TX_UNDER_80MHZ, "STBC-TX-UNDER-80MHZ"); PFLAG(PHY, 2, STBC_RX_UNDER_80MHZ, "STBC-RX-UNDER-80MHZ"); PFLAG(PHY, 2, DOPPLER_TX, "DOPPLER-TX"); PFLAG(PHY, 2, DOPPLER_RX, "DOPPLER-RX"); PFLAG(PHY, 2, UL_MU_FULL_MU_MIMO, "UL-MU-FULL-MU-MIMO"); PFLAG(PHY, 2, UL_MU_PARTIAL_MU_MIMO, "UL-MU-PARTIAL-MU-MIMO"); switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK) { case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM: PRINT("DCM-MAX-CONST-TX-NO-DCM"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK: PRINT("DCM-MAX-CONST-TX-BPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK: PRINT("DCM-MAX-CONST-TX-QPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM: PRINT("DCM-MAX-CONST-TX-16-QAM"); break; } PFLAG(PHY, 3, DCM_MAX_TX_NSS_1, "DCM-MAX-TX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_TX_NSS_2, "DCM-MAX-TX-NSS-2"); switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK) { case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM: PRINT("DCM-MAX-CONST-RX-NO-DCM"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK: PRINT("DCM-MAX-CONST-RX-BPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK: PRINT("DCM-MAX-CONST-RX-QPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM: PRINT("DCM-MAX-CONST-RX-16-QAM"); break; } PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2"); PFLAG(PHY, 3, RX_PARTIAL_BW_SU_IN_20MHZ_MU, "RX-PARTIAL-BW-SU-IN-20MHZ-MU"); PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE"); PFLAG(PHY, 4, MU_BEAMFORMER, "MU-BEAMFORMER"); PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_UNDER_80MHZ, 0, 1, 4, "BEAMFORMEE-MAX-STS-UNDER-%d"); PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_ABOVE_80MHZ, 0, 1, 4, "BEAMFORMEE-MAX-STS-ABOVE-%d"); PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ, 0, 1, 1, "NUM-SND-DIM-UNDER-80MHZ-%d"); PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ, 0, 1, 1, "NUM-SND-DIM-ABOVE-80MHZ-%d"); PFLAG(PHY, 5, NG16_SU_FEEDBACK, "NG16-SU-FEEDBACK"); PFLAG(PHY, 5, NG16_MU_FEEDBACK, "NG16-MU-FEEDBACK"); PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU"); PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU"); PFLAG(PHY, 6, TRIG_SU_BEAMFORMING_FB, "TRIG-SU-BEAMFORMING-FB"); PFLAG(PHY, 6, TRIG_MU_BEAMFORMING_PARTIAL_BW_FB, "MU-BEAMFORMING-PARTIAL-BW-FB"); PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB"); PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE"); PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO, "PARTIAL-BANDWIDTH-DL-MUMIMO"); PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT"); PFLAG(PHY, 7, PSR_BASED_SR, "PSR-BASED-SR"); PFLAG(PHY, 7, POWER_BOOST_FACTOR_SUPP, "POWER-BOOST-FACTOR-SUPP"); PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI, "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI"); PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d"); PFLAG(PHY, 7, STBC_TX_ABOVE_80MHZ, "STBC-TX-ABOVE-80MHZ"); PFLAG(PHY, 7, STBC_RX_ABOVE_80MHZ, "STBC-RX-ABOVE-80MHZ"); PFLAG(PHY, 8, HE_ER_SU_PPDU_4XLTF_AND_08_US_GI, "HE-ER-SU-PPDU-4XLTF-AND-08-US-GI"); PFLAG(PHY, 8, 20MHZ_IN_40MHZ_HE_PPDU_IN_2G, "20MHZ-IN-40MHZ-HE-PPDU-IN-2G"); PFLAG(PHY, 8, 20MHZ_IN_160MHZ_HE_PPDU, "20MHZ-IN-160MHZ-HE-PPDU"); PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU"); PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI, "HE-ER-SU-1XLTF-AND-08-US-GI"); PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF, "MIDAMBLE-RX-TX-2X-AND-1XLTF"); switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK) { case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242: PRINT("DCM-MAX-RU-242"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484: PRINT("DCM-MAX-RU-484"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996: PRINT("DCM-MAX-RU-996"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996: PRINT("DCM-MAX-RU-2x996"); break; } PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM, "LONGER-THAN-16-SIGB-OFDM-SYM"); PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK, "NON-TRIGGERED-CQI-FEEDBACK"); PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU, "TX-1024-QAM-LESS-THAN-242-TONE-RU"); PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU, "RX-1024-QAM-LESS-THAN-242-TONE-RU"); PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB, "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB"); PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB, "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB"); switch (u8_get_bits(cap[9], IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK)) { case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US: PRINT("NOMINAL-PACKET-PADDING-0US"); break; case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US: PRINT("NOMINAL-PACKET-PADDING-8US"); break; case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US: PRINT("NOMINAL-PACKET-PADDING-16US"); break; } #undef PFLAG_RANGE_DEFAULT #undef PFLAG_RANGE #undef PFLAG #define PRINT_NSS_SUPP(f, n) \ do { \ int _i; \ u16 v = le16_to_cpu(nss->f); \ p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \ for (_i = 0; _i < 8; _i += 2) { \ switch ((v >> _i) & 0x3) { \ case 0: \ PRINT(n "-%d-SUPPORT-0-7", _i / 2); \ break; \ case 1: \ PRINT(n "-%d-SUPPORT-0-9", _i / 2); \ break; \ case 2: \ PRINT(n "-%d-SUPPORT-0-11", _i / 2); \ break; \ case 3: \ PRINT(n "-%d-NOT-SUPPORTED", _i / 2); \ break; \ } \ } \ } while (0) PRINT_NSS_SUPP(rx_mcs_80, "RX-MCS-80"); PRINT_NSS_SUPP(tx_mcs_80, "TX-MCS-80"); if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) { PRINT_NSS_SUPP(rx_mcs_160, "RX-MCS-160"); PRINT_NSS_SUPP(tx_mcs_160, "TX-MCS-160"); } if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) { PRINT_NSS_SUPP(rx_mcs_80p80, "RX-MCS-80P80"); PRINT_NSS_SUPP(tx_mcs_80p80, "TX-MCS-80P80"); } #undef PRINT_NSS_SUPP #undef PRINT if (!(cap[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)) goto out; p += scnprintf(p, buf_sz + buf - p, "PPE-THRESHOLDS: %#.2x", hec->ppe_thres[0]); ppe_size = ieee80211_he_ppe_size(hec->ppe_thres[0], cap); for (i = 1; i < ppe_size; i++) { p += scnprintf(p, buf_sz + buf - p, " %#.2x", hec->ppe_thres[i]); } p += scnprintf(p, buf_sz + buf - p, "\n"); out: ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(he_capa); static ssize_t link_sta_eht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_eht_cap *bec = &link_sta->pub->eht_cap; struct ieee80211_eht_cap_elem_fixed *fixed = &bec->eht_cap_elem; struct ieee80211_eht_mcs_nss_supp *nss = &bec->eht_mcs_nss_supp; u8 *cap; int i; ssize_t ret; static const char *mcs_desc[] = { "0-7", "8-9", "10-11", "12-13"}; buf = kmalloc(buf_sz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, buf_sz + buf - p, "EHT %ssupported\n", bec->has_eht ? "" : "not "); if (!bec->has_eht) goto out; p += scnprintf(p, buf_sz + buf - p, "MAC-CAP: %#.2x %#.2x\n", fixed->mac_cap_info[0], fixed->mac_cap_info[1]); p += scnprintf(p, buf_sz + buf - p, "PHY-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", fixed->phy_cap_info[0], fixed->phy_cap_info[1], fixed->phy_cap_info[2], fixed->phy_cap_info[3], fixed->phy_cap_info[4], fixed->phy_cap_info[5], fixed->phy_cap_info[6], fixed->phy_cap_info[7], fixed->phy_cap_info[8]); #define PRINT(fmt, ...) \ p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \ ##__VA_ARGS__) #define PFLAG(t, n, a, b) \ do { \ if (cap[n] & IEEE80211_EHT_##t##_CAP##n##_##a) \ PRINT("%s", b); \ } while (0) cap = fixed->mac_cap_info; PFLAG(MAC, 0, EPCS_PRIO_ACCESS, "EPCS-PRIO-ACCESS"); PFLAG(MAC, 0, OM_CONTROL, "OM-CONTROL"); PFLAG(MAC, 0, TRIG_TXOP_SHARING_MODE1, "TRIG-TXOP-SHARING-MODE1"); PFLAG(MAC, 0, TRIG_TXOP_SHARING_MODE2, "TRIG-TXOP-SHARING-MODE2"); PFLAG(MAC, 0, RESTRICTED_TWT, "RESTRICTED-TWT"); PFLAG(MAC, 0, SCS_TRAFFIC_DESC, "SCS-TRAFFIC-DESC"); switch ((cap[0] & 0xc0) >> 6) { case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895: PRINT("MAX-MPDU-LEN: 3985"); break; case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991: PRINT("MAX-MPDU-LEN: 7991"); break; case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454: PRINT("MAX-MPDU-LEN: 11454"); break; } cap = fixed->phy_cap_info; PFLAG(PHY, 0, 320MHZ_IN_6GHZ, "320MHZ-IN-6GHZ"); PFLAG(PHY, 0, 242_TONE_RU_GT20MHZ, "242-TONE-RU-GT20MHZ"); PFLAG(PHY, 0, NDP_4_EHT_LFT_32_GI, "NDP-4-EHT-LFT-32-GI"); PFLAG(PHY, 0, PARTIAL_BW_UL_MU_MIMO, "PARTIAL-BW-UL-MU-MIMO"); PFLAG(PHY, 0, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 0, SU_BEAMFORMEE, "SU-BEAMFORMEE"); i = cap[0] >> 7; i |= (cap[1] & 0x3) << 1; PRINT("BEAMFORMEE-80-NSS: %i", i); PRINT("BEAMFORMEE-160-NSS: %i", (cap[1] >> 2) & 0x7); PRINT("BEAMFORMEE-320-NSS: %i", (cap[1] >> 5) & 0x7); PRINT("SOUNDING-DIM-80-NSS: %i", (cap[2] & 0x7)); PRINT("SOUNDING-DIM-160-NSS: %i", (cap[2] >> 3) & 0x7); i = cap[2] >> 6; i |= (cap[3] & 0x1) << 3; PRINT("SOUNDING-DIM-320-NSS: %i", i); PFLAG(PHY, 3, NG_16_SU_FEEDBACK, "NG-16-SU-FEEDBACK"); PFLAG(PHY, 3, NG_16_MU_FEEDBACK, "NG-16-MU-FEEDBACK"); PFLAG(PHY, 3, CODEBOOK_4_2_SU_FDBK, "CODEBOOK-4-2-SU-FDBK"); PFLAG(PHY, 3, CODEBOOK_7_5_MU_FDBK, "CODEBOOK-7-5-MU-FDBK"); PFLAG(PHY, 3, TRIG_SU_BF_FDBK, "TRIG-SU-BF-FDBK"); PFLAG(PHY, 3, TRIG_MU_BF_PART_BW_FDBK, "TRIG-MU-BF-PART-BW-FDBK"); PFLAG(PHY, 3, TRIG_CQI_FDBK, "TRIG-CQI-FDBK"); PFLAG(PHY, 4, PART_BW_DL_MU_MIMO, "PART-BW-DL-MU-MIMO"); PFLAG(PHY, 4, PSR_SR_SUPP, "PSR-SR-SUPP"); PFLAG(PHY, 4, POWER_BOOST_FACT_SUPP, "POWER-BOOST-FACT-SUPP"); PFLAG(PHY, 4, EHT_MU_PPDU_4_EHT_LTF_08_GI, "EHT-MU-PPDU-4-EHT-LTF-08-GI"); PRINT("MAX_NC: %i", cap[4] >> 4); PFLAG(PHY, 5, NON_TRIG_CQI_FEEDBACK, "NON-TRIG-CQI-FEEDBACK"); PFLAG(PHY, 5, TX_LESS_242_TONE_RU_SUPP, "TX-LESS-242-TONE-RU-SUPP"); PFLAG(PHY, 5, RX_LESS_242_TONE_RU_SUPP, "RX-LESS-242-TONE-RU-SUPP"); PFLAG(PHY, 5, PPE_THRESHOLD_PRESENT, "PPE_THRESHOLD_PRESENT"); switch (cap[5] >> 4 & 0x3) { case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US: PRINT("NOMINAL_PKT_PAD: 0us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US: PRINT("NOMINAL_PKT_PAD: 8us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US: PRINT("NOMINAL_PKT_PAD: 16us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US: PRINT("NOMINAL_PKT_PAD: 20us"); break; } i = cap[5] >> 6; i |= cap[6] & 0x7; PRINT("MAX-NUM-SUPP-EHT-LTF: %i", i); PFLAG(PHY, 5, SUPP_EXTRA_EHT_LTF, "SUPP-EXTRA-EHT-LTF"); i = (cap[6] >> 3) & 0xf; PRINT("MCS15-SUPP-MASK: %i", i); PFLAG(PHY, 6, EHT_DUP_6GHZ_SUPP, "EHT-DUP-6GHZ-SUPP"); PFLAG(PHY, 7, 20MHZ_STA_RX_NDP_WIDER_BW, "20MHZ-STA-RX-NDP-WIDER-BW"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_80MHZ, "NON-OFDMA-UL-MU-MIMO-80MHZ"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_160MHZ, "NON-OFDMA-UL-MU-MIMO-160MHZ"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_320MHZ, "NON-OFDMA-UL-MU-MIMO-320MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_80MHZ, "MU-BEAMFORMER-80MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_160MHZ, "MU-BEAMFORMER-160MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_320MHZ, "MU-BEAMFORMER-320MHZ"); PFLAG(PHY, 7, TB_SOUNDING_FDBK_RATE_LIMIT, "TB-SOUNDING-FDBK-RATE-LIMIT"); PFLAG(PHY, 8, RX_1024QAM_WIDER_BW_DL_OFDMA, "RX-1024QAM-WIDER-BW-DL-OFDMA"); PFLAG(PHY, 8, RX_4096QAM_WIDER_BW_DL_OFDMA, "RX-4096QAM-WIDER-BW-DL-OFDMA"); #undef PFLAG PRINT(""); /* newline */ if (!(link_sta->pub->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) { u8 *mcs_vals = (u8 *)(&nss->only_20mhz); for (i = 0; i < 4; i++) PRINT("EHT bw=20 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); } else { u8 *mcs_vals = (u8 *)(&nss->bw._80); for (i = 0; i < 3; i++) PRINT("EHT bw <= 80 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); mcs_vals = (u8 *)(&nss->bw._160); for (i = 0; i < 3; i++) PRINT("EHT bw <= 160 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); mcs_vals = (u8 *)(&nss->bw._320); for (i = 0; i < 3; i++) PRINT("EHT bw <= 320 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); } if (cap[5] & IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { u8 ppe_size = ieee80211_eht_ppe_size(bec->eht_ppe_thres[0], cap); p += scnprintf(p, buf_sz + buf - p, "EHT PPE Thresholds: "); for (i = 0; i < ppe_size; i++) p += scnprintf(p, buf_sz + buf - p, "0x%02x ", bec->eht_ppe_thres[i]); PRINT(""); /* newline */ } out: ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(eht_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ sta->debugfs_dir, sta, &sta_ ##name## _ops) #define DEBUGFS_ADD_COUNTER(name, field) \ debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field); void ieee80211_sta_debugfs_add(struct sta_info *sta) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; u8 mac[3*ETH_ALEN]; if (!stations_dir) return; snprintf(mac, sizeof(mac), "%pM", sta->sta.addr); /* * This might fail due to a race condition: * When mac80211 unlinks a station, the debugfs entries * remain, but it is already possible to link a new * station with the same address which triggers adding * it to debugfs; therefore, if the old station isn't * destroyed quickly enough the old station's debugfs * dir might still be around. */ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); DEBUGFS_ADD(flags); DEBUGFS_ADD(aid); DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); DEBUGFS_ADD(aqm); DEBUGFS_ADD(airtime); if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) DEBUGFS_ADD(aql); debugfs_create_xul("driver_buffered_tids", 0400, sta->debugfs_dir, &sta->driver_buffered_tids); drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); } void ieee80211_sta_debugfs_remove(struct sta_info *sta) { debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } #undef DEBUGFS_ADD #undef DEBUGFS_ADD_COUNTER #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops) #define DEBUGFS_ADD_COUNTER(name, field) \ debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field) void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) { if (WARN_ON(!link_sta->sta->debugfs_dir)) return; /* For non-MLO, leave the files in the main directory. */ if (link_sta->sta->sta.valid_links) { char link_dir_name[10]; snprintf(link_dir_name, sizeof(link_dir_name), "link-%d", link_sta->link_id); link_sta->debugfs_dir = debugfs_create_dir(link_dir_name, link_sta->sta->debugfs_dir); DEBUGFS_ADD(addr); } else { if (WARN_ON(link_sta != &link_sta->sta->deflink)) return; link_sta->debugfs_dir = link_sta->sta->debugfs_dir; } DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(he_capa); DEBUGFS_ADD(eht_capa); DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); } void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) { if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) { link_sta->debugfs_dir = NULL; return; } if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) { WARN_ON(link_sta != &link_sta->sta->deflink); link_sta->sta->debugfs_dir = NULL; return; } debugfs_remove_recursive(link_sta->debugfs_dir); link_sta->debugfs_dir = NULL; } void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) { if (WARN_ON(!link_sta->debugfs_dir)) return; drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata, link_sta->pub, link_sta->debugfs_dir); } void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) { if (!link_sta->debugfs_dir) return; if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir)) return; /* Recreate the directory excluding the driver data */ debugfs_remove_recursive(link_sta->debugfs_dir); link_sta->debugfs_dir = NULL; ieee80211_link_sta_debugfs_add(link_sta); }
33 33 33 33 33 33 33 25 25 1 8 233 234 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 /* * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined * from just a typename, as opposed to an actual function. */ DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. */ DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; u64 delta; if (unlikely(!hwc->event_base)) return 0; /* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); do { rdpmcl(hwc->event_base_rdpmc, new_raw_count); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } /* * Find and validate any extra registers to set up. */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; if (!extra_regs) return 0; for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; /* Check if the extra msrs can be safely accessed*/ if (!er->extra_msr_access) return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; reg->reg = er->msr; break; } return 0; } static atomic_t active_events; static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC static inline int get_possible_num_counters(void) { int i, num_counters = x86_pmu.num_counters; if (!is_hybrid()) return num_counters; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); return num_counters; } static bool reserve_pmc_hardware(void) { int i, num_counters = get_possible_num_counters(); for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } return true; eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu_config_addr(i)); i = num_counters; perfctr_fail: for (i--; i >= 0; i--) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } static void release_pmc_hardware(void) { int i, num_counters = get_possible_num_counters(); for (i = 0; i < num_counters; i++) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } } #else static bool reserve_pmc_hardware(void) { return true; } static void release_pmc_hardware(void) {} #endif bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ for (i = 0; i < num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } else { reg_safe = i; } } if (num_counters_fixed) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; for (i = 0; i < num_counters_fixed; i++) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; } } } /* * If all the counters are enabled, the below test will always * fail. The tools will also become useless in this scenario. * Just fail and disable the hardware counters. */ if (reg_safe == -1) { reg = reg_safe; goto msr_fail; } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; /* * We still allow the PMU driver to operate: */ if (bios_fail) { pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); } return true; msr_fail: if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { pr_cont("PMU not available due to virtualization, using software events only.\n"); } else { pr_cont("Broken PMU hardware detected, using software events only.\n"); pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); } return false; } static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) { hw_perf_event_destroy(event); /* undo the lbr/bts event accounting */ x86_del_exclusive(x86_lbr_exclusive_lbr); } static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; } static inline int set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; config = attr->config; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; if (val == -1) return -EINVAL; hwc->config |= val; attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } int x86_reserve_hardware(void) { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) { err = -EBUSY; } else { reserve_ds_buffers(); reserve_lbr_buffers(); } } if (!err) atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } return err; } void x86_release_hardware(void) { if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } /* * Check if we can create event of a certain type (that no conflicting events * are present). */ int x86_add_exclusive(unsigned int what) { int i; /* * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto fail_unlock; } atomic_inc(&x86_pmu.lbr_exclusive[what]); mutex_unlock(&pmc_reserve_mutex); } out: atomic_inc(&active_events); return 0; fail_unlock: mutex_unlock(&pmc_reserve_mutex); return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&active_events); /* * See the comment in x86_add_exclusive(). */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); } int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); /* * The generic map: */ config = x86_pmu.event_map(attr->config); if (config == 0) return -ENOENT; if (config == -1LL) return -EINVAL; hwc->config |= config; return 0; } /* * check that branch_sample_type is compatible with * settings needed for precise_ip > 1 which implies * using the LBR to capture ALL taken branches at the * priv levels of the measurement */ static inline int precise_br_compat(struct perf_event *event) { u64 m = event->attr.branch_sample_type; u64 b = 0; /* must capture all branches */ if (!(m & PERF_SAMPLE_BRANCH_ANY)) return 0; m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_user) b |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) b |= PERF_SAMPLE_BRANCH_KERNEL; /* * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 */ return m == b; } int x86_pmu_max_precise(void) { int precise = 0; /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; if (x86_pmu.pebs_prec_dist) precise++; } return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; /* There's no sense in having PEBS for non sampling events: */ if (!is_sampling_event(event)) return -EINVAL; } /* * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { if (!precise_br_compat(event)) return -EOPNOTSUPP; /* branch_sample_type is compatible */ } else { /* * user did not specify branch_sample_type * * For PEBS fixups, we capture all * the branches at the priv level of the * event. */ *br_type = PERF_SAMPLE_BRANCH_ANY; if (!event->attr.exclude_user) *br_type |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) event->attach_state |= PERF_ATTACH_TASK_DATA; /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ event->hw.config = ARCH_PERFMON_EVENTSEL_INT; /* * Count user and OS events unless requested not to */ if (!event->attr.exclude_user) event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) return -EINVAL; } /* sample_regs_user never support XMM registers */ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) return -EINVAL; } return x86_setup_perfctr(event); } /* * Setup the hardware configuration for a given attr_type */ static int __x86_pmu_event_init(struct perf_event *event) { int err; if (!x86_pmu_initialized()) return -ENODEV; err = x86_reserve_hardware(); if (err) return err; atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(idx + 1), 0); } } struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. * * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. * It will not be re-enabled in the NMI handler again, because enabled=0. After * handling the NMI, disable_all will be called, which will not change the * state either. If PMI hits after disable_all, the PMU is already disabled * before entering NMI handler. The NMI handler will not change the state * either. * * So either situation is harmless. */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; if (!cpuc->enabled) return; cpuc->n_added = 0; cpuc->enabled = 0; barrier(); static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } static inline int is_x86_event(struct perf_event *event) { int i; if (!is_hybrid()) return event->pmu == &pmu; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) return true; } return false; } struct pmu *x86_get_pmu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); /* * All CPUs of the hybrid type have been offline. * The x86_get_pmu() should not be invoked. */ if (WARN_ON_ONCE(!cpuc->pmu)) return &pmu; return cpuc->pmu; } /* * Event scheduler state: * * Assign events iterating over all events and counters, beginning * with events with least weights first. Keep the current iterator * state in struct sched_state. */ struct sched_state { int weight; int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ #define SCHED_STATES_MAX 2 struct perf_sched { int max_weight; int max_events; int max_gp; int saved_states; struct event_constraint **constraints; struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. */ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = num_counters; /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax = num_counters - cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { int num_counters = hybrid(cpuc->pmu, num_counters); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; max_count = num_counters + num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) { return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i]; } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; if (hwc->state & PERF_HES_ARCH) continue; /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. */ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; u64 pebs, debugctl; int cpu = smp_processor_id(); struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int num_counters = hybrid(cpuc->pmu, num_counters); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; int idx; if (!num_counters) return; local_irq_save(flags); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < num_counters; idx++) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < num_counters_fixed; idx++) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_restore(flags); } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. */ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. */ apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. */ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config = x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. */ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. */ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, u64 intel_ctrl) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %lu\n", hweight64((((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED) & intel_ctrl)); pr_info("... event mask: %016Lx\n", intel_ctrl); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) { x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed, x86_pmu.intel_ctrl); } if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. */ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. */ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ cpuc->txn_flags = txn_flags; if (txn_flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); __this_cpu_write(cpu_hw_events.n_txn_pair, 0); __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* * Stop group events scheduling transaction * Clear the flag and pmu::enable() will perform the * schedulability test. */ static void x86_pmu_cancel_txn(struct pmu *pmu) { unsigned int txn_flags; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ txn_flags = cpuc->txn_flags; cpuc->txn_flags = 0; if (txn_flags & ~PERF_PMU_TXN_ADD) return; /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } /* * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success * * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { cpuc->txn_flags = 0; return 0; } n = cpuc->n_events; if (!x86_pmu_initialized()) return -EAGAIN; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } /* * a fake_cpuc is used to validate event groups. Due to * the extra reg logic, we need to also allocate a fake * per_core and per_cpu structure. Otherwise, group events * using extra reg may conflict without the kernel being * able to catch this when the last event gets added to * the group. */ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { intel_cpuc_finish(cpuc); kfree(cpuc); } static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; if (is_hybrid()) { struct x86_hybrid_pmu *h_pmu; h_pmu = hybrid_pmu(event_pmu); if (cpumask_empty(&h_pmu->supported_cpus)) goto error; cpu = cpumask_first(&h_pmu->supported_cpus); } else cpu = raw_smp_processor_id(); cpuc->pmu = event_pmu; if (intel_cpuc_prepare(cpuc, cpu)) goto error; return cpuc; error: free_fake_cpuc(cpuc); return ERR_PTR(-ENOMEM); } /* * validate that we can schedule this event */ static int validate_event(struct perf_event *event) { struct cpu_hw_events *fake_cpuc; struct event_constraint *c; int ret = 0; fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); free_fake_cpuc(fake_cpuc); return ret; } /* * validate a single event group * * validation include: * - check events are compatible which each other * - events do not compete for the same counter * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. */ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; /* * Reject events from different hybrid PMUs. */ if (is_hybrid()) { struct perf_event *sibling; struct pmu *pmu = NULL; if (is_x86_event(leader)) pmu = leader->pmu; for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) continue; if (!pmu) pmu = sibling->pmu; else if (pmu != sibling->pmu) return ret; } } fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ n = collect_events(fake_cpuc, leader, true); if (n < 0) goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) goto out; fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: free_fake_cpuc(fake_cpuc); return ret; } static int x86_pmu_event_init(struct perf_event *event) { struct x86_hybrid_pmu *pmu = NULL; int err; if ((event->attr.type != event->pmu->type) && (event->attr.type != PERF_TYPE_HARDWARE) && (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; if (is_hybrid() && (event->cpu != -1)) { pmu = hybrid_pmu(event->pmu); if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); } if (err) { if (event->destroy) event->destroy(event); event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } void perf_clear_dirty_counters(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* Don't need to clear the assigned counter. */ for (i = 0; i < cpuc->n_events; i++) __clear_bit(cpuc->assign[i], cpuc->dirty); if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) continue; wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrl(x86_pmu_event_addr(i), 0); } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* * This function relies on not being called concurrently in two * tasks in the same mm. Otherwise one task could observe * perf_rdpmc_allowed > 1 and return all the way back to * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. */ mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) return INTEL_PMC_FIXED_RDPMC_METRICS + 1; else return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long val; ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val > 2) return -EINVAL; if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; if (val != x86_pmu.attr_rdpmc) { /* * Changing into or out of never available or always available, * aka perf-event-bypassing mode. This path is extremely slow, * but only root can trigger it, so it's okay. */ if (val == 0) static_branch_inc(&rdpmc_never_available_key); else if (x86_pmu.attr_rdpmc == 0) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } return count; } static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); static struct attribute *x86_pmu_attrs[] = { &dev_attr_rdpmc.attr, NULL, }; static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); } static DEVICE_ATTR_RO(max_precise); static struct attribute *x86_pmu_caps_attrs[] = { &dev_attr_max_precise.attr, NULL }; static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, &x86_pmu_caps_group, NULL, }; static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc) { static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) { if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } static int x86_pmu_check_period(struct perf_event *event, u64 value) { if (x86_pmu.check_period && x86_pmu.check_period(event, value)) return -EINVAL; if (value && x86_pmu.limit_period) { s64 left = value; x86_pmu.limit_period(event, &left); if (left > value) return -EINVAL; } return 0; } static int x86_pmu_aux_output_match(struct perf_event *event) { if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) return 0; if (x86_pmu.aux_output_match) return x86_pmu.aux_output_match(event); return 0; } static bool x86_pmu_filter(struct pmu *pmu, int cpu) { bool ret = false; static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); return ret; } static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, .event_mapped = x86_pmu_event_mapped, .event_unmapped = x86_pmu_event_unmapped, .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) return; cyc2ns_read_begin(&data); offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; userpg->time_mult = data.cyc2ns_mul; userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = offset; } cyc2ns_read_end(); } /* * Determine whether the regs were taken from an irq/exception handler rather * than from perf_arch_fetch_caller_regs(). */ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_callchain_store(entry, regs->ip)) return; if (perf_hw_regs(regs)) unwind_start(&state, current, regs, NULL); else unwind_start(&state, current, NULL, (void *)regs->sp); for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. */ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_instruction_pointer(struct pt_regs *regs) { if (perf_guest_state()) return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { unsigned int guest_state = perf_guest_state(); int misc = 0; if (guest_state) { if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { if (user_mode(regs)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; } if (regs->flags & PERF_EFLAGS_EXACT) misc |= PERF_RECORD_MISC_EXACT_IP; return misc; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
6 1536 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 // SPDX-License-Identifier: GPL-2.0 /* drivers/net/wireless/virt_wifi.c * * A fake implementation of cfg80211_ops that can be tacked on to an ethernet * net_device to make it appear as a wireless connection. * * Copyright (C) 2018 Google, Inc. * * Author: schuffelen@google.com */ #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/math64.h> #include <linux/module.h> static struct wiphy *common_wiphy; struct virt_wifi_wiphy_priv { struct delayed_work scan_result; struct cfg80211_scan_request *scan_request; bool being_deleted; }; static struct ieee80211_channel channel_2ghz = { .band = NL80211_BAND_2GHZ, .center_freq = 2432, .hw_value = 2432, .max_power = 20, }; static struct ieee80211_rate bitrates_2ghz[] = { { .bitrate = 10 }, { .bitrate = 20 }, { .bitrate = 55 }, { .bitrate = 110 }, { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; static struct ieee80211_supported_band band_2ghz = { .channels = &channel_2ghz, .bitrates = bitrates_2ghz, .band = NL80211_BAND_2GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_2ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, }; static struct ieee80211_channel channel_5ghz = { .band = NL80211_BAND_5GHZ, .center_freq = 5240, .hw_value = 5240, .max_power = 20, }; static struct ieee80211_rate bitrates_5ghz[] = { { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; #define RX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) #define TX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) static struct ieee80211_supported_band band_5ghz = { .channels = &channel_5ghz, .bitrates = bitrates_5ghz, .band = NL80211_BAND_5GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_5ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, .vht_cap = { .vht_supported = true, .cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_1 | IEEE80211_VHT_CAP_RXSTBC_2 | IEEE80211_VHT_CAP_RXSTBC_3 | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, .vht_mcs = { .rx_mcs_map = cpu_to_le16(RX_MCS_MAP), .tx_mcs_map = cpu_to_le16(TX_MCS_MAP), } }, }; /* Assigned at module init. Guaranteed locally-administered and unicast. */ static u8 fake_router_bssid[ETH_ALEN] __ro_after_init = {}; static void virt_wifi_inform_bss(struct wiphy *wiphy) { u64 tsf = div_u64(ktime_get_boottime_ns(), 1000); struct cfg80211_bss *informed_bss; static const struct { u8 tag; u8 len; u8 ssid[8]; } __packed ssid = { .tag = WLAN_EID_SSID, .len = 8, .ssid = "VirtWifi", }; informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz, CFG80211_BSS_FTYPE_PRESP, fake_router_bssid, tsf, WLAN_CAPABILITY_ESS, 0, (void *)&ssid, sizeof(ssid), DBM_TO_MBM(-50), GFP_KERNEL); cfg80211_put_bss(wiphy, informed_bss); } /* Called with the rtnl lock held. */ static int virt_wifi_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); wiphy_debug(wiphy, "scan\n"); if (priv->scan_request || priv->being_deleted) return -EBUSY; priv->scan_request = request; schedule_delayed_work(&priv->scan_result, HZ * 2); return 0; } /* Acquires and releases the rdev BSS lock. */ static void virt_wifi_scan_result(struct work_struct *work) { struct virt_wifi_wiphy_priv *priv = container_of(work, struct virt_wifi_wiphy_priv, scan_result.work); struct wiphy *wiphy = priv_to_wiphy(priv); struct cfg80211_scan_info scan_info = { .aborted = false }; virt_wifi_inform_bss(wiphy); /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } /* May acquire and release the rdev BSS lock. */ static void virt_wifi_cancel_scan(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); cancel_delayed_work_sync(&priv->scan_result); /* Clean up dangling callbacks if necessary. */ if (priv->scan_request) { struct cfg80211_scan_info scan_info = { .aborted = true }; /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } } struct virt_wifi_netdev_priv { struct delayed_work connect; struct net_device *lowerdev; struct net_device *upperdev; u32 tx_packets; u32 tx_failed; u8 connect_requested_bss[ETH_ALEN]; bool is_up; bool is_connected; bool being_deleted; }; /* Called with the rtnl lock held. */ static int virt_wifi_connect(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_connect_params *sme) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); bool could_schedule; if (priv->being_deleted || !priv->is_up) return -EBUSY; could_schedule = schedule_delayed_work(&priv->connect, HZ * 2); if (!could_schedule) return -EBUSY; if (sme->bssid) { ether_addr_copy(priv->connect_requested_bss, sme->bssid); } else { virt_wifi_inform_bss(wiphy); eth_zero_addr(priv->connect_requested_bss); } wiphy_debug(wiphy, "connect\n"); return 0; } /* Acquires and releases the rdev event lock. */ static void virt_wifi_connect_complete(struct work_struct *work) { struct virt_wifi_netdev_priv *priv = container_of(work, struct virt_wifi_netdev_priv, connect.work); u8 *requested_bss = priv->connect_requested_bss; bool right_addr = ether_addr_equal(requested_bss, fake_router_bssid); u16 status = WLAN_STATUS_SUCCESS; if (is_zero_ether_addr(requested_bss)) requested_bss = NULL; if (!priv->is_up || (requested_bss && !right_addr)) status = WLAN_STATUS_UNSPECIFIED_FAILURE; else priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } /* May acquire and release the rdev event lock. */ static void virt_wifi_cancel_connect(struct net_device *netdev) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); /* If there is work pending, clean up dangling callbacks. */ if (cancel_delayed_work_sync(&priv->connect)) { /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, priv->connect_requested_bss, NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, GFP_KERNEL); } } /* Called with the rtnl lock held. Acquires the rdev event lock. */ static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev, u16 reason_code) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); if (priv->being_deleted) return -EBUSY; wiphy_debug(wiphy, "disconnect\n"); virt_wifi_cancel_connect(netdev); cfg80211_disconnected(netdev, reason_code, NULL, 0, true, GFP_KERNEL); priv->is_connected = false; netif_carrier_off(netdev); return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "get_station\n"); if (!priv->is_connected || !ether_addr_equal(mac, fake_router_bssid)) return -ENOENT; sinfo->filled = BIT_ULL(NL80211_STA_INFO_TX_PACKETS) | BIT_ULL(NL80211_STA_INFO_TX_FAILED) | BIT_ULL(NL80211_STA_INFO_SIGNAL) | BIT_ULL(NL80211_STA_INFO_TX_BITRATE); sinfo->tx_packets = priv->tx_packets; sinfo->tx_failed = priv->tx_failed; /* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ */ sinfo->signal = -50; sinfo->txrate = (struct rate_info) { .legacy = 10, /* units are 100kbit/s */ }; return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "dump_station\n"); if (idx != 0 || !priv->is_connected) return -ENOENT; ether_addr_copy(mac, fake_router_bssid); return virt_wifi_get_station(wiphy, dev, fake_router_bssid, sinfo); } static const struct cfg80211_ops virt_wifi_cfg80211_ops = { .scan = virt_wifi_scan, .connect = virt_wifi_connect, .disconnect = virt_wifi_disconnect, .get_station = virt_wifi_get_station, .dump_station = virt_wifi_dump_station, }; /* Acquires and releases the rtnl lock. */ static struct wiphy *virt_wifi_make_wiphy(void) { struct wiphy *wiphy; struct virt_wifi_wiphy_priv *priv; int err; wiphy = wiphy_new(&virt_wifi_cfg80211_ops, sizeof(*priv)); if (!wiphy) return NULL; wiphy->max_scan_ssids = 4; wiphy->max_scan_ie_len = 1000; wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; wiphy->bands[NL80211_BAND_2GHZ] = &band_2ghz; wiphy->bands[NL80211_BAND_5GHZ] = &band_5ghz; wiphy->bands[NL80211_BAND_60GHZ] = NULL; wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); priv = wiphy_priv(wiphy); priv->being_deleted = false; priv->scan_request = NULL; INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result); err = wiphy_register(wiphy); if (err < 0) { wiphy_free(wiphy); return NULL; } return wiphy; } /* Acquires and releases the rtnl lock. */ static void virt_wifi_destroy_wiphy(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv; WARN(!wiphy, "%s called with null wiphy", __func__); if (!wiphy) return; priv = wiphy_priv(wiphy); priv->being_deleted = true; virt_wifi_cancel_scan(wiphy); if (wiphy->registered) wiphy_unregister(wiphy); wiphy_free(wiphy); } /* Enters and exits a RCU-bh critical section. */ static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->tx_packets++; if (!priv->is_connected) { priv->tx_failed++; return NET_XMIT_DROP; } skb->dev = priv->lowerdev; return dev_queue_xmit(skb); } /* Called with rtnl lock held. */ static int virt_wifi_net_device_open(struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->is_up = true; return 0; } /* Called with rtnl lock held. */ static int virt_wifi_net_device_stop(struct net_device *dev) { struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev); n_priv->is_up = false; if (!dev->ieee80211_ptr) return 0; virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_connect(dev); netif_carrier_off(dev); return 0; } static int virt_wifi_net_device_get_iflink(const struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); return priv->lowerdev->ifindex; } static const struct net_device_ops virt_wifi_ops = { .ndo_start_xmit = virt_wifi_start_xmit, .ndo_open = virt_wifi_net_device_open, .ndo_stop = virt_wifi_net_device_stop, .ndo_get_iflink = virt_wifi_net_device_get_iflink, }; /* Invoked as part of rtnl lock release. */ static void virt_wifi_net_device_destructor(struct net_device *dev) { /* Delayed past dellink to allow nl80211 to react to the device being * deleted. */ kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; } /* No lock interaction. */ static void virt_wifi_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &virt_wifi_ops; dev->needs_free_netdev = true; } /* Called in a RCU read critical section from netif_receive_skb */ static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct virt_wifi_netdev_priv *priv = rcu_dereference(skb->dev->rx_handler_data); if (!priv->is_connected) return RX_HANDLER_PASS; /* GFP_ATOMIC because this is a packet interrupt handler. */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { dev_err(&priv->upperdev->dev, "can't skb_share_check\n"); return RX_HANDLER_CONSUMED; } *pskb = skb; skb->dev = priv->upperdev; skb->pkt_type = PACKET_HOST; return RX_HANDLER_ANOTHER; } /* Called with rtnl lock held. */ static int virt_wifi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); int err; if (!tb[IFLA_LINK]) return -EINVAL; netif_carrier_off(dev); priv->upperdev = dev; priv->lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!priv->lowerdev) return -ENODEV; if (!tb[IFLA_MTU]) dev->mtu = priv->lowerdev->mtu; else if (dev->mtu > priv->lowerdev->mtu) return -EINVAL; err = netdev_rx_handler_register(priv->lowerdev, virt_wifi_rx_handler, priv); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_rx_handler_register: %d\n", err); return err; } eth_hw_addr_inherit(dev, priv->lowerdev); netif_stacked_transfer_operstate(priv->lowerdev, dev); SET_NETDEV_DEV(dev, &priv->lowerdev->dev); dev->ieee80211_ptr = kzalloc(sizeof(*dev->ieee80211_ptr), GFP_KERNEL); if (!dev->ieee80211_ptr) { err = -ENOMEM; goto remove_handler; } dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION; dev->ieee80211_ptr->wiphy = common_wiphy; err = register_netdevice(dev); if (err) { dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n", err); goto free_wireless_dev; } err = netdev_upper_dev_link(priv->lowerdev, dev, extack); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_upper_dev_link: %d\n", err); goto unregister_netdev; } dev->priv_destructor = virt_wifi_net_device_destructor; priv->being_deleted = false; priv->is_connected = false; priv->is_up = false; INIT_DELAYED_WORK(&priv->connect, virt_wifi_connect_complete); __module_get(THIS_MODULE); return 0; unregister_netdev: unregister_netdevice(dev); free_wireless_dev: kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; remove_handler: netdev_rx_handler_unregister(priv->lowerdev); return err; } /* Called with rtnl lock held. */ static void virt_wifi_dellink(struct net_device *dev, struct list_head *head) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); if (dev->ieee80211_ptr) virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); priv->being_deleted = true; virt_wifi_cancel_connect(dev); netif_carrier_off(dev); netdev_rx_handler_unregister(priv->lowerdev); netdev_upper_dev_unlink(priv->lowerdev, dev); unregister_netdevice_queue(dev, head); module_put(THIS_MODULE); /* Deleting the wiphy is handled in the module destructor. */ } static struct rtnl_link_ops virt_wifi_link_ops = { .kind = "virt_wifi", .setup = virt_wifi_setup, .newlink = virt_wifi_newlink, .dellink = virt_wifi_dellink, .priv_size = sizeof(struct virt_wifi_netdev_priv), }; static bool netif_is_virt_wifi_dev(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == virt_wifi_rx_handler; } static int virt_wifi_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr); struct virt_wifi_netdev_priv *priv; struct net_device *upper_dev; LIST_HEAD(list_kill); if (!netif_is_virt_wifi_dev(lower_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: priv = rtnl_dereference(lower_dev->rx_handler_data); if (!priv) return NOTIFY_DONE; upper_dev = priv->upperdev; upper_dev->rtnl_link_ops->dellink(upper_dev, &list_kill); unregister_netdevice_many(&list_kill); break; } return NOTIFY_DONE; } static struct notifier_block virt_wifi_notifier = { .notifier_call = virt_wifi_event, }; /* Acquires and releases the rtnl lock. */ static int __init virt_wifi_init_module(void) { int err; /* Guaranteed to be locally-administered and not multicast. */ eth_random_addr(fake_router_bssid); err = register_netdevice_notifier(&virt_wifi_notifier); if (err) return err; err = -ENOMEM; common_wiphy = virt_wifi_make_wiphy(); if (!common_wiphy) goto notifier; err = rtnl_link_register(&virt_wifi_link_ops); if (err) goto destroy_wiphy; return 0; destroy_wiphy: virt_wifi_destroy_wiphy(common_wiphy); notifier: unregister_netdevice_notifier(&virt_wifi_notifier); return err; } /* Acquires and releases the rtnl lock. */ static void __exit virt_wifi_cleanup_module(void) { /* Will delete any devices that depend on the wiphy. */ rtnl_link_unregister(&virt_wifi_link_ops); virt_wifi_destroy_wiphy(common_wiphy); unregister_netdevice_notifier(&virt_wifi_notifier); } module_init(virt_wifi_init_module); module_exit(virt_wifi_cleanup_module); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Cody Schuffelen <schuffelen@google.com>"); MODULE_DESCRIPTION("Driver for a wireless wrapper of ethernet devices"); MODULE_ALIAS_RTNL_LINK("virt_wifi");
5757 3546 4271 3296 1150 1128 5919 3811 1853 6178 550 5751 5998 4628 140 551 1256 2508 795 574 1806 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/buffer_head.h * * Everything to do with buffer_heads. */ #ifndef _LINUX_BUFFER_HEAD_H #define _LINUX_BUFFER_HEAD_H #include <linux/types.h> #include <linux/blk_types.h> #include <linux/fs.h> #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/wait.h> #include <linux/atomic.h> enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ BH_Async_Write, /* Is under end_buffer_async_write I/O */ BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_Meta, /* Buffer contains metadata */ BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_Defer_Completion, /* Defer AIO completion to workqueue */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities */ }; #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512) struct page; struct buffer_head; struct address_space; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); /* * Historically, a buffer_head was used to map a single block * within a page, and of course as the unit of I/O through the * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within * a page (via a page_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ union { struct page *b_page; /* the page this bh is mapped to */ struct folio *b_folio; /* the folio this bh is mapped to */ }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ char *b_data; /* pointer to data within the page */ struct block_device *b_bdev; bh_end_io_t *b_end_io; /* I/O completion */ void *b_private; /* reserved for b_end_io */ struct list_head b_assoc_buffers; /* associated with another mapping */ struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to * serialise IO completion of other * buffers in the page */ }; /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. * To avoid reset buffer flags that are already set, because that causes * a costly cache line transition, check the flag first. */ #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ if (!test_bit(BH_##bit, &(bh)->b_state)) \ set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ clear_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int buffer_##name(const struct buffer_head *bh) \ { \ return test_bit(BH_##bit, &(bh)->b_state); \ } /* * test_set_buffer_foo() and test_clear_buffer_foo() */ #define TAS_BUFFER_FNS(bit, name) \ static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \ { \ return test_and_set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ { \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ } \ /* * Emit the buffer bitops functions. Note that there are also functions * of the form "mark_buffer_foo()". These are higher-level functions which * do something in addition to setting a b_state bit. */ BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) BUFFER_FNS(Req, req) TAS_BUFFER_FNS(Req, req) BUFFER_FNS(Mapped, mapped) BUFFER_FNS(New, new) BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) BUFFER_FNS(Meta, meta) BUFFER_FNS(Prio, prio) BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { /* * If somebody else already set this uptodate, they will * have done the memory barrier, and a reader will thus * see *some* valid buffer state. * * Any other serialization (with IO errors or whatever that * might clear the bit) has to come from other state (eg BH_Lock). */ if (test_bit(BH_Uptodate, &bh->b_state)) return; /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate */ smp_mb__before_atomic(); set_bit(BH_Uptodate, &bh->b_state); } static __always_inline void clear_buffer_uptodate(struct buffer_head *bh) { clear_bit(BH_Uptodate, &bh->b_state); } static __always_inline int buffer_uptodate(const struct buffer_head *bh) { /* * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ return test_bit_acquire(BH_Uptodate, &bh->b_state); } static inline unsigned long bh_offset(const struct buffer_head *bh) { return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1); } /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) #define folio_buffers(folio) folio_get_private(folio) void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback); /* * Declarations */ void mark_buffer_dirty(struct buffer_head *bh); void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, bool datasync); void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh) { clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); } void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct page *page, unsigned int from, unsigned int to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); #ifdef CONFIG_MIGRATION extern int buffer_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); extern int buffer_migrate_folio_norefs(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define buffer_migrate_folio NULL #define buffer_migrate_folio_norefs NULL #endif /* * inline definitions */ static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); } static inline void put_bh(struct buffer_head *bh) { smp_mb__before_atomic(); atomic_dec(&bh->b_count); } static inline void brelse(struct buffer_head *bh) { if (bh) __brelse(bh); } static inline void bforget(struct buffer_head *bh) { if (bh) __bforget(bh); } static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * sb_bread_unmovable(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void sb_breadahead(struct super_block *sb, sector_t block) { __breadahead(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) { return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp); } static inline struct buffer_head * sb_find_get_block(struct super_block *sb, sector_t block) { return __find_get_block(sb->s_bdev, block, sb->s_blocksize); } static inline void map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) { set_buffer_mapped(bh); bh->b_bdev = sb->s_bdev; bh->b_blocknr = block; bh->b_size = sb->s_blocksize; } static inline void wait_on_buffer(struct buffer_head *bh) { might_sleep(); if (buffer_locked(bh)) __wait_on_buffer(bh); } static inline int trylock_buffer(struct buffer_head *bh) { return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); } static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); if (!trylock_buffer(bh)) __lock_buffer(bh); } static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block, unsigned size) { return __getblk_gfp(bdev, block, size, 0); } static inline struct buffer_head *__getblk(struct block_device *bdev, sector_t block, unsigned size) { return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); } static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) { if (!buffer_uptodate(bh) && trylock_buffer(bh)) { if (!buffer_uptodate(bh)) __bh_read(bh, op_flags, false); else unlock_buffer(bh); } } static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) { if (!bh_uptodate_or_lock(bh)) __bh_read(bh, op_flags, false); } /* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) { if (bh_uptodate_or_lock(bh)) return 1; return __bh_read(bh, op_flags, true); } static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) { __bh_read_batch(nr, bhs, 0, true); } static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags) { __bh_read_batch(nr, bhs, op_flags, false); } /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read * * Reads a specified block, and returns buffer head that contains it. * The page cache is allocated from movable area so that it can be migrated. * It returns NULL if the block was unreadable. */ static inline struct buffer_head * __bread(struct block_device *bdev, sector_t block, unsigned size) { return __bread_gfp(bdev, block, size, __GFP_MOVABLE); } bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_BUFFER_HEAD void buffer_init(void); bool try_to_free_buffers(struct folio *folio); int inode_has_buffers(struct inode *inode); void invalidate_inode_buffers(struct inode *inode); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); extern int buffer_heads_over_limit; #else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */
11873 22 56 1 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor file mediation function definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_FILE_H #define __AA_FILE_H #include <linux/spinlock.h> #include "domain.h" #include "match.h" #include "perms.h" struct aa_policydb; struct aa_profile; struct path; #define mask_mode_t(X) (X & (MAY_EXEC | MAY_WRITE | MAY_READ | MAY_APPEND)) #define AA_AUDIT_FILE_MASK (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND |\ AA_MAY_CREATE | AA_MAY_DELETE | \ AA_MAY_GETATTR | AA_MAY_SETATTR | \ AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \ AA_EXEC_MMAP | AA_MAY_LINK) static inline struct aa_file_ctx *file_ctx(struct file *file) { return file->f_security + apparmor_blob_sizes.lbs_file; } /* struct aa_file_ctx - the AppArmor context the file was opened in * @lock: lock to update the ctx * @label: label currently cached on the ctx * @perms: the permission the file was opened with */ struct aa_file_ctx { spinlock_t lock; struct aa_label __rcu *label; u32 allow; }; /** * aa_alloc_file_ctx - allocate file_ctx * @label: initial label of task creating the file * @gfp: gfp flags for allocation * * Returns: file_ctx or NULL on failure */ static inline struct aa_file_ctx *aa_alloc_file_ctx(struct aa_label *label, gfp_t gfp) { struct aa_file_ctx *ctx; ctx = kzalloc(sizeof(struct aa_file_ctx), gfp); if (ctx) { spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); } return ctx; } /** * aa_free_file_ctx - free a file_ctx * @ctx: file_ctx to free (MAYBE_NULL) */ static inline void aa_free_file_ctx(struct aa_file_ctx *ctx) { if (ctx) { aa_put_label(rcu_access_pointer(ctx->label)); kfree_sensitive(ctx); } } static inline struct aa_label *aa_get_file_label(struct aa_file_ctx *ctx) { return aa_get_label_rcu(&ctx->label); } /* * The xindex is broken into 3 parts * - index - an index into either the exec name table or the variable table * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ #define AA_X_INDEX_MASK AA_INDEX_MASK #define AA_X_TYPE_MASK 0x0c000000 #define AA_X_NONE AA_INDEX_NONE #define AA_X_NAME 0x04000000 /* use executable name px */ #define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ #define AA_X_UNSAFE 0x10000000 #define AA_X_CHILD 0x20000000 #define AA_X_INHERIT 0x40000000 #define AA_X_UNCONFINED 0x80000000 /* need to make conditional which ones are being set */ struct path_cond { kuid_t uid; umode_t mode; }; #define COMBINED_PERM_MASK(X) ((X).allow | (X).audit | (X).quiet | (X).kill) int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond); aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); int aa_path_perm(const char *op, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); int aa_path_link(struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry); int aa_file_perm(const char *op, struct aa_label *label, struct file *file, u32 request, bool in_atomic); void aa_inherit_files(const struct cred *cred, struct files_struct *files); /** * aa_map_file_perms - map file flags to AppArmor permissions * @file: open file to map flags to AppArmor permissions * * Returns: apparmor permission set for the file */ static inline u32 aa_map_file_to_perms(struct file *file) { int flags = file->f_flags; u32 perms = 0; if (file->f_mode & FMODE_WRITE) perms |= MAY_WRITE; if (file->f_mode & FMODE_READ) perms |= MAY_READ; if ((flags & O_APPEND) && (perms & MAY_WRITE)) perms = (perms & ~MAY_WRITE) | MAY_APPEND; /* trunc implies write permission */ if (flags & O_TRUNC) perms |= MAY_WRITE; if (flags & O_CREAT) perms |= AA_MAY_CREATE; return perms; } #endif /* __AA_FILE_H */
1535 1536 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/ip.h> #include <linux/in.h> #include <linux/jhash.h> #include <net/arp.h> #include <net/addrconf.h> #include <linux/inetdevice.h> #include <rdma/ib_cache.h> MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct net_device *dev; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static int ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static int ipoib_set_mac(struct net_device *dev, void *addr); static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int ipoib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *ni = ptr; struct net_device *dev = ni->dev; if (dev->netdev_ops->ndo_open != ipoib_open) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: ipoib_create_debug_files(dev); break; case NETDEV_CHANGENAME: ipoib_delete_debug_files(dev); ipoib_create_debug_files(dev); break; case NETDEV_UNREGISTER: ipoib_delete_debug_files(dev); break; } return NOTIFY_DONE; } #endif int ipoib_open(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "bringing up interface\n"); netif_carrier_off(dev); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; } ipoib_ib_dev_up(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (flags & IFF_UP) continue; dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } else if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", ppriv->dev->name); } netif_start_queue(dev); return 0; err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static int ipoib_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); netif_stop_queue(dev); ipoib_ib_dev_down(dev); ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (!(flags & IFF_UP)) continue; dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } return 0; } static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); return features; } static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = 0; /* dev->mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(dev)) { if (new_mtu > ipoib_cm_max_mtu(dev)) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->mtu = new_mtu; return 0; } if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; if (priv->mcast_mtu < priv->admin_mtu) ipoib_dbg(priv, "MTU must be smaller than the underlying " "link layer MTU - 4 (%u)\n", priv->mcast_mtu); new_mtu = min(priv->mcast_mtu, priv->admin_mtu); if (priv->rn_ops->ndo_change_mtu) { bool carrier_status = netif_carrier_ok(dev); netif_carrier_off(dev); /* notify lower level on the real mtu */ ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); if (carrier_status) netif_carrier_on(dev); } else { dev->mtu = new_mtu; } return ret; } static void ipoib_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (priv->rn_ops->ndo_get_stats64) priv->rn_ops->ndo_get_stats64(dev, stats); else netdev_stats_to_stats64(stats, &dev->stats); } /* Called with an RCU read lock taken */ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, struct net_device *dev) { struct net *net = dev_net(dev); struct in_device *in_dev; struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; __be32 ret_addr; switch (addr->sa_family) { case AF_INET: in_dev = in_dev_get(dev); if (!in_dev) return false; ret_addr = inet_confirm_addr(net, in_dev, 0, addr_in->sin_addr.s_addr, RT_SCOPE_HOST); in_dev_put(in_dev); if (ret_addr) return true; break; case AF_INET6: if (IS_ENABLED(CONFIG_IPV6) && ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) return true; break; } return false; } /* * Find the master net_device on top of the given net_device. * @dev: base IPoIB net_device * * Returns the master net_device with a reference held, or the same net_device * if no master exists. */ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master) dev_hold(master); rcu_read_unlock(); if (master) return master; dev_hold(dev); return dev; } struct ipoib_walk_data { const struct sockaddr *addr; struct net_device *result; }; static int ipoib_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { dev_hold(upper); data->result = upper; ret = 1; } return ret; } /** * ipoib_get_net_dev_match_addr - Find a net_device matching * the given address, which is an upper device of the given net_device. * * @addr: IP address to look for. * @dev: base IPoIB net_device * * If found, returns the net_device with a reference held. Otherwise return * NULL. */ static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); data.result = dev; goto out; } netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; } /* returns the number of IPoIB netdevs on top a given ipoib device matching a * pkey_index and address, if one exists. * * @found_net_dev: contains a matching net_device if the return value >= 1, * with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, int nesting, struct net_device **found_net_dev) { struct ipoib_dev_priv *child_priv; struct net_device *net_dev = NULL; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (!addr) { net_dev = ipoib_get_master_net_dev(priv->dev); } else { /* Verify the net_device matches the IP address, as * IPoIB child devices currently share a GID. */ net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); } if (net_dev) { if (!*found_net_dev) *found_net_dev = net_dev; else dev_put(net_dev); ++matches; } } /* Check child interfaces */ down_read_nested(&priv->vlan_rwsem, nesting); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, nesting + 1, found_net_dev); if (matches > 1) break; } up_read(&priv->vlan_rwsem); return matches; } /* Returns the number of matching net_devs found (between 0 and 2). Also * return the matching net_device in the @net_dev parameter, holding a * reference to the net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, 0, net_dev); if (matches > 1) break; } return matches; } static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); fallthrough; case 1: return net_dev; } } int ipoib_set_mode(struct net_device *dev, const char *buf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "connected\n")) || (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "datagram\n"))) { return 0; } /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); netdev_update_features(dev); dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); netif_set_real_num_tx_queues(dev, 1); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); netdev_update_features(dev); dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); netif_set_real_num_tx_queues(dev, dev->num_tx_queues); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } return -EINVAL; } struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } static void path_free(struct net_device *dev, struct ipoib_path *path) { struct sk_buff *skb; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); /* remove all neigh connected to this path */ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; iter->dev = dev; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec)), path->pathrec.dgid.raw); if (path->ah) path->ah->valid = 0; } spin_unlock_irq(&priv->lock); } static void push_pseudo_header(struct sk_buff *skb, const char *daddr) { struct ipoib_pseudo_header *phdr; phdr = skb_push(skb, sizeof(*phdr)); memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); } void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); wait_for_completion(&path->done); path_free(dev, path); netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); } static void path_rec_completion(int status, struct sa_path_rec *pathrec, unsigned int num_prs, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ipoib_neigh *neigh, *tn; struct sk_buff_head skqueue; struct sk_buff *skb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->dgid.raw); else ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", status, path->pathrec.dgid.raw); skb_queue_head_init(&skqueue); if (!status) { struct rdma_ah_attr av; if (!ib_init_ah_attr_from_path(priv->ca, priv->port, pathrec, &av, NULL)) { ah = ipoib_create_ah(dev, priv->pd, &av); rdma_destroy_ah_attr(&av); } } spin_lock_irqsave(&priv->lock, flags); if (!IS_ERR_OR_NULL(ah)) { /* * pathrec.dgid is used as the database key from the LLADDR, * it must remain unchanged even if the SA returns a different * GID to use in the AH. */ if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid))) { ipoib_dbg( priv, "%s got PathRec for gid %pI6 while asked for %pI6\n", dev->name, pathrec->dgid.raw, path->pathrec.dgid.raw); memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid)); } path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->sl); while ((skb = __skb_dequeue(&path->queue))) __skb_queue_tail(&skqueue, skb); list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { if (neigh->ah) { WARN_ON(neigh->ah != old_ah); /* * Dropping the ah reference inside * priv->lock is safe here, because we * will hold one more reference from * the original value of path->ah (ie * old_ah). */ ipoib_put_ah(neigh->ah); } kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); continue; } } while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } path->ah->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (IS_ERR_OR_NULL(ah)) ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (old_ah) ipoib_put_ah(old_ah); while ((skb = __skb_dequeue(&skqueue))) { int ret; skb->dev = dev; ret = dev_queue_xmit(skb); if (ret) ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", __func__, ret); } } static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, void *gid) { path->dev = priv->dev; if (rdma_cap_opa_ah(priv->ca, priv->port)) path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; else path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; } static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof(*path), GFP_ATOMIC); if (!path) return NULL; skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); init_path_rec(priv, path, gid); return path; } static int path_rec_start(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "Start path record lookup for %pI6\n", path->pathrec.dgid.raw); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &path->pathrec, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); path = __path_find(dev, daddr + 4); if (!path) goto out; if (!path->query) path_rec_start(dev, path); out: spin_unlock_irqrestore(&priv->lock, flags); } static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NULL; } /* To avoid race condition, make sure that the * neigh will be added only once. */ if (unlikely(!list_empty(&neigh->list))) { spin_unlock_irqrestore(&priv->lock, flags); return neigh; } path = __path_find(dev, daddr + 4); if (!path) { path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; __path_add(dev, path); } list_add_tail(&neigh->list, &path->neigh_list); if (path->ah && path->ah->valid) { kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { ipoib_warn(priv, "queue length limit %d. Packet drop.\n", skb_queue_len(&neigh->queue)); goto err_drop; } } else { spin_unlock_irqrestore(&priv->lock, flags); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); return NULL; } } else { neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) goto err_path; if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { goto err_drop; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; err_path: ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, struct ipoib_pseudo_header *phdr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); /* no broadcast means that all paths are (going to be) not valid */ if (!priv->broadcast) goto drop_and_unlock; path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->ah || !path->ah->valid) { if (!path) { path = path_rec_create(dev, phdr->hwaddr + 4); if (!path) goto drop_and_unlock; __path_add(dev, path); } else { /* * make sure there are no changes in the existing * path record */ init_path_rec(priv, path, phdr->hwaddr + 4); } if (!path->query && path_rec_start(dev, path)) { goto drop_and_unlock; } if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); goto unlock; } else { goto drop_and_unlock; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_dbg(priv, "Send unicast ARP to %08x\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec))); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(phdr->hwaddr)); return; drop_and_unlock: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); unlock: spin_unlock_irqrestore(&priv->lock, flags); } static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_neigh *neigh; struct ipoib_pseudo_header *phdr; struct ipoib_header *header; unsigned long flags; phdr = (struct ipoib_pseudo_header *) skb->data; skb_pull(skb, sizeof(*phdr)); header = (struct ipoib_header *) skb->data; if (unlikely(phdr->hwaddr[4] == 0xff)) { /* multicast, arrange "if" according to probability */ if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_ARP)) && (header->proto != htons(ETH_P_RARP)) && (header->proto != htons(ETH_P_TIPC))) { /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* Add in the P_Key for multicast*/ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; phdr->hwaddr[9] = priv->pkey & 0xff; neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (likely(neigh)) goto send_using_neigh; ipoib_mcast_send(dev, phdr->hwaddr, skb); return NETDEV_TX_OK; } /* unicast, arrange "switch" according to probability */ switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) { neigh = neigh_add_path(skb, phdr->hwaddr, dev); if (likely(!neigh)) return NETDEV_TX_OK; } break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): /* for unicast ARP and RARP should always perform path find */ unicast_arp_send(skb, dev, phdr); return NETDEV_TX_OK; default: /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } send_using_neigh: /* note we now hold a ref to neigh */ if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } } else if (neigh->ah && neigh->ah->valid) { neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, IPOIB_QPN(phdr->hwaddr)); goto unref; } else if (neigh->ah) { neigh_refresh_path(neigh, phdr->hwaddr, dev); } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } unref: ipoib_neigh_put(neigh); return NETDEV_TX_OK; } static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); if (rn->tx_timeout) { rn->tx_timeout(dev, txqueue); return; } ipoib_warn(priv, "transmit timeout: latency %d msecs\n", jiffies_to_msecs(jiffies - dev_trans_start(dev))); ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n", netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, priv->global_tx_head, priv->global_tx_tail); /* XXX reset QP, etc. */ } static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ipoib_header *header; header = skb_push(skb, sizeof(*header)); header->proto = htons(type); header->reserved = 0; /* * we don't rely on dst_entry structure, always stuff the * destination address into skb hard header so we can figure out where * to send the packet later. */ push_pseudo_header(skb, daddr); return IPOIB_HARD_LEN; } static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); return; } queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); /* parent interface */ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) return dev->ifindex; /* child/vlan interface */ return priv->parent->ifindex; } static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { /* * Use only the address parts that contributes to spreading * The subnet prefix is not used as one can not connect to * same remote port (GUID) using the same remote QPN via two * different subnets. */ /* qpn octets[1:4) & port GUID octets[12:20) */ u32 *d32 = (u32 *) daddr; u32 hv; hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); return hv & htbl->mask; } struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh = NULL; u32 hash_val; rcu_read_lock_bh(); htbl = rcu_dereference_bh(ntbl->htbl); if (!htbl) goto out_unlock; hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); neigh != NULL; neigh = rcu_dereference_bh(neigh->hnext)) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; goto out_unlock; } if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) neigh->alive = jiffies; goto out_unlock; } } out_unlock: rcu_read_unlock_bh(); return neigh; } static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long neigh_obsolete; unsigned long dt; unsigned long flags; int i; LIST_HEAD(remove_list); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; /* neigh is obsolete if it was idle for two GC periods */ dt = 2 * arp_tbl.gc_interval; neigh_obsolete = jiffies - dt; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); ipoib_mcast_remove_list(&remove_list); } static void ipoib_reap_neigh(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); __ipoib_reap_neigh(priv); queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct net_device *dev) { struct ipoib_neigh *neigh; neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); if (!neigh) return NULL; neigh->dev = dev; memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); skb_queue_head_init(&neigh->queue); INIT_LIST_HEAD(&neigh->list); ipoib_cm_set(neigh, NULL); /* one ref on behalf of the caller */ refcount_set(&neigh->refcnt, 1); return neigh; } struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) { neigh = NULL; goto out_unlock; } /* need to add a new neigh, but maybe some other thread succeeded? * recalc hash, maybe hash resize took place so we do a search */ hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock)); neigh != NULL; neigh = rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; break; } neigh->alive = jiffies; goto out_unlock; } } neigh = ipoib_neigh_ctor(daddr, dev); if (!neigh) goto out_unlock; /* one ref on behalf of the hash table */ refcount_inc(&neigh->refcnt); neigh->alive = jiffies; /* put in hash */ rcu_assign_pointer(neigh->hnext, rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock))); rcu_assign_pointer(htbl->buckets[hash_val], neigh); atomic_inc(&ntbl->entries); out_unlock: return neigh; } void ipoib_neigh_dtor(struct ipoib_neigh *neigh) { /* neigh reference count was dropprd to zero */ struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; if (neigh->ah) ipoib_put_ah(neigh->ah); while ((skb = __skb_dequeue(&neigh->queue))) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); ipoib_dbg(ipoib_priv(dev), "neigh free for %06x %pI6\n", IPOIB_QPN(neigh->daddr), neigh->daddr + 4); kfree(neigh); if (atomic_dec_and_test(&priv->ntbl.entries)) { if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) complete(&priv->ntbl.flushed); } } static void ipoib_neigh_reclaim(struct rcu_head *rp) { /* Called as a result of removal from hash table */ struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); /* note TX context may hold another ref */ ipoib_neigh_put(neigh); } void ipoib_neigh_free(struct ipoib_neigh *neigh) { struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **np; struct ipoib_neigh *n; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) return; hash_val = ipoib_addr_hash(htbl, neigh->daddr); np = &htbl->buckets[hash_val]; for (n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock)); n != NULL; n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) { if (n == neigh) { /* found */ rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { np = &n->hnext; } } } static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **buckets; u32 size; clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); ntbl->htbl = NULL; htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; size = roundup_pow_of_two(arp_tbl.gc_thresh3); buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); if (!buckets) { kfree(htbl); return -ENOMEM; } htbl->size = size; htbl->mask = (size - 1); htbl->buckets = buckets; RCU_INIT_POINTER(ntbl->htbl, htbl); htbl->ntbl = ntbl; atomic_set(&ntbl->entries, 0); /* start garbage collection */ queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct ipoib_neigh_hash *htbl = container_of(head, struct ipoib_neigh_hash, rcu); struct ipoib_neigh __rcu **buckets = htbl->buckets; struct ipoib_neigh_table *ntbl = htbl->ntbl; kvfree(buckets); kfree(htbl); complete(&ntbl->deleted); } void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i; /* remove all neigh connected to a given path or mcast */ spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* delete neighs belong to this parent */ if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i, wait_flushed = 0; init_completion(&priv->ntbl.flushed); set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; wait_flushed = atomic_read(&priv->ntbl.entries); if (!wait_flushed) goto free_htbl; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } free_htbl: rcu_assign_pointer(ntbl->htbl, NULL); call_rcu(&htbl->rcu, neigh_hash_free_rcu); out_unlock: spin_unlock_irqrestore(&priv->lock, flags); if (wait_flushed) wait_for_completion(&priv->ntbl.flushed); } static void ipoib_neigh_hash_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "%s\n", __func__); init_completion(&priv->ntbl.deleted); cancel_delayed_work_sync(&priv->neigh_reap_task); ipoib_flush_neighs(priv); wait_for_completion(&priv->ntbl.deleted); } static void ipoib_napi_add(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); } static void ipoib_napi_del(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_del(&priv->recv_napi); netif_napi_del(&priv->send_napi); } static void ipoib_dev_uninit_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_transport_dev_cleanup(dev); ipoib_napi_del(dev); ipoib_cm_dev_cleanup(dev); kfree(priv->rx_ring); vfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static int ipoib_dev_init_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); u8 addr_mod[3]; ipoib_napi_add(dev); /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kcalloc(ipoib_recvq_size, sizeof(*priv->rx_ring), GFP_KERNEL); if (!priv->rx_ring) goto out; priv->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*priv->tx_ring))); if (!priv->tx_ring) { pr_warn("%s: failed to allocate TX ring (%d entries)\n", priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */ if (ipoib_transport_dev_init(dev, priv->ca)) { pr_warn("%s: ipoib_transport_dev_init failed\n", priv->ca->name); goto out_tx_ring_cleanup; } /* after qp created set dev address */ addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; addr_mod[2] = (priv->qp->qp_num) & 0xff; dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); return 0; out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: ipoib_napi_del(dev); return -ENOMEM; } static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!priv->rn_ops->ndo_eth_ioctl) return -EOPNOTSUPP; return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); } static int ipoib_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = -ENOMEM; priv->qp = NULL; /* * the various IPoIB tasks assume they will never race against * themselves, so always use a single thread workqueue */ priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); if (!priv->wq) { pr_warn("%s: failed to allocate device WQ\n", dev->name); goto out; } /* create pd, which used both for control and datapath*/ priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { pr_warn("%s: failed to allocate PD\n", priv->ca->name); goto clean_wq; } ret = priv->rn_ops->ndo_init(dev); if (ret) { pr_warn("%s failed to init HW resource\n", dev->name); goto out_free_pd; } ret = ipoib_neigh_hash_init(priv); if (ret) { pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; } if (dev->flags & IFF_UP) { if (ipoib_ib_dev_open(dev)) { pr_warn("%s failed to open device\n", dev->name); ret = -ENODEV; goto out_hash_uninit; } } return 0; out_hash_uninit: ipoib_neigh_hash_uninit(dev); out_dev_uninit: ipoib_ib_dev_cleanup(dev); out_free_pd: if (priv->pd) { ib_dealloc_pd(priv->pd); priv->pd = NULL; } clean_wq: if (priv->wq) { destroy_workqueue(priv->wq); priv->wq = NULL; } out: return ret; } /* * This must be called before doing an unregister_netdev on a parent device to * shutdown the IB event handler. */ static void ipoib_parent_unregister_pre(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); /* * ipoib_set_mac checks netif_running before pushing work, clearing * running ensures the it will not add more work. */ rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); rtnl_unlock(); /* ipoib_event() cannot be running once this returns */ ib_unregister_event_handler(&priv->event_handler); /* * Work on the queue grabs the rtnl lock, so this cannot be done while * also holding it. */ flush_workqueue(ipoib_workqueue); } static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) { priv->hca_caps = priv->ca->attrs.device_cap_flags; priv->kernel_caps = priv->ca->attrs.kernel_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; if (priv->kernel_caps & IBK_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; priv->dev->features |= priv->dev->hw_features; } } static int ipoib_parent_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ib_port_attr attr; int result; result = ib_query_port(priv->ca, priv->port, &attr); if (result) { pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, priv->port); return result; } priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); if (result) { pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); if (result) { pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); priv->dev->dev_port = priv->port - 1; /* Let's set this one too for backwards compatibility. */ priv->dev->dev_id = priv->port - 1; return 0; } static void ipoib_child_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); priv->max_ib_mtu = ppriv->max_ib_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) memcpy(&priv->local_gid, priv->dev->dev_addr + 4, sizeof(priv->local_gid)); else { __dev_addr_set(priv->dev, ppriv->dev->dev_addr, INFINIBAND_ALEN); memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); } } static int ipoib_ndo_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); int rc; struct rdma_netdev *rn = netdev_priv(ndev); if (priv->parent) { ipoib_child_init(ndev); } else { rc = ipoib_parent_init(ndev); if (rc) return rc; } /* MTU will be reset when mcast join happens */ ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = ndev->mtu; rn->mtu = priv->mcast_mtu; ndev->max_mtu = IPOIB_CM_MTU; ndev->neigh_priv_len = sizeof(struct ipoib_neigh); /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; ndev->broadcast[8] = priv->pkey >> 8; ndev->broadcast[9] = priv->pkey & 0xff; set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); ipoib_set_dev_features(priv); rc = ipoib_dev_init(ndev); if (rc) { pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", priv->ca->name, priv->dev->name, priv->port, rc); return rc; } if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); dev_hold(priv->parent); down_write(&ppriv->vlan_rwsem); list_add_tail(&priv->list, &ppriv->child_intfs); up_write(&ppriv->vlan_rwsem); } return 0; } static void ipoib_ndo_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ASSERT_RTNL(); /* * ipoib_remove_one guarantees the children are removed before the * parent, and that is the only place where a parent can be removed. */ WARN_ON(!list_empty(&priv->child_intfs)); if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); down_write(&ppriv->vlan_rwsem); list_del(&priv->list); up_write(&ppriv->vlan_rwsem); } ipoib_neigh_hash_uninit(dev); ipoib_ib_dev_cleanup(dev); /* no more works over the priv->wq */ if (priv->wq) { /* See ipoib_mcast_carrier_on_task() */ WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); destroy_workqueue(priv->wq); priv->wq = NULL; } if (priv->parent) dev_put(priv->parent); } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); } static int ipoib_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int err; err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); if (err) return err; ivf->vf = vf; memcpy(ivf->mac, dev->dev_addr, dev->addr_len); return 0; } static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) return -EINVAL; return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); } static int ipoib_get_vf_guid(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); } static int ipoib_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); } static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_set_vf_link_state = ipoib_set_vf_link_state, .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_get_vf_guid = ipoib_get_vf_guid, .ndo_set_vf_guid = ipoib_set_vf_guid, .ndo_set_mac_address = ipoib_set_mac, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_default_pf = { .ndo_init = ipoib_dev_init_default, .ndo_uninit = ipoib_dev_uninit_default, .ndo_open = ipoib_ib_dev_open_default, .ndo_stop = ipoib_ib_dev_stop_default, }; void ipoib_setup_common(struct net_device *dev) { dev->header_ops = &ipoib_header_ops; dev->netdev_ops = &ipoib_netdev_default_pf; ipoib_set_ethtool_ops(dev); dev->watchdog_timeo = HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); /* * unregister_netdev always frees the netdev, we use this mode * consistently to unify all the various unregister paths, including * those connected to rtnl_link_ops which require it. */ dev->needs_free_netdev = true; } static void ipoib_build_priv(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); priv->dev = dev; spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP) return dev; dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!dev) return ERR_PTR(-ENOMEM); return dev; } int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, struct net_device *dev) { struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_dev_priv *priv; int rc; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->ca = hca; priv->port = port; rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common, dev); if (rc) { if (rc != -EOPNOTSUPP) goto out; rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; rc = netif_set_real_num_tx_queues(dev, 1); if (rc) goto out; rc = netif_set_real_num_rx_queues(dev, 1); if (rc) goto out; } priv->rn_ops = dev->netdev_ops; if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) dev->netdev_ops = &ipoib_netdev_ops_vf; else dev->netdev_ops = &ipoib_netdev_ops_pf; rn->clnt_priv = priv; /* * Only the child register_netdev flows can handle priv_destructor * being set, so we force it to NULL here and handle manually until it * is safe to turn on. */ priv->next_priv_destructor = dev->priv_destructor; dev->priv_destructor = NULL; ipoib_build_priv(dev); return 0; out: kfree(priv); return rc; } struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; int rc; dev = ipoib_alloc_netdev(hca, port, name); if (IS_ERR(dev)) return dev; rc = ipoib_intf_init(hca, port, name, dev); if (rc) { free_netdev(dev); return ERR_PTR(rc); } /* * Upon success the caller must ensure ipoib_intf_free is called or * register_netdevice succeed'd and priv_destructor is set to * ipoib_intf_free. */ return dev; } void ipoib_intf_free(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); dev->priv_destructor = priv->next_priv_destructor; if (dev->priv_destructor) dev->priv_destructor(dev); /* * There are some error flows around register_netdev failing that may * attempt to call priv_destructor twice, prevent that from happening. */ dev->priv_destructor = NULL; /* unregister/destroy is very complicated. Make bugs more obvious. */ rn->clnt_priv = NULL; kfree(priv); } static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "0x%04x\n", priv->pkey); } static DEVICE_ATTR_RO(pkey); static ssize_t umcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); ipoib_warn(priv, "ignoring multicast groups joined directly " "by userspace\n"); } else clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); } static ssize_t umcast_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long umcast_val = simple_strtoul(buf, NULL, 0); ipoib_set_umcast(to_net_dev(dev), umcast_val); return count; } static DEVICE_ATTR_RW(umcast); int ipoib_add_umcast_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_umcast); } static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) { struct ipoib_dev_priv *child_priv; struct net_device *netdev = priv->dev; netif_addr_lock_bh(netdev); memcpy(&priv->local_gid.global.interface_id, &gid->global.interface_id, sizeof(gid->global.interface_id)); dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); netif_addr_unlock_bh(netdev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { down_read(&priv->vlan_rwsem); list_for_each_entry(child_priv, &priv->child_intfs, list) set_base_guid(child_priv, gid); up_read(&priv->vlan_rwsem); } } static int ipoib_check_lladdr(struct net_device *dev, struct sockaddr_storage *ss) { union ib_gid *gid = (union ib_gid *)(ss->__data + 4); int ret = 0; netif_addr_lock_bh(dev); /* Make sure the QPN, reserved and subnet prefix match the current * lladdr, it also makes sure the lladdr is unicast. */ if (memcmp(dev->dev_addr, ss->__data, 4 + sizeof(gid->global.subnet_prefix)) || gid->global.interface_id == 0) ret = -EINVAL; netif_addr_unlock_bh(dev); return ret; } static int ipoib_set_mac(struct net_device *dev, void *addr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sockaddr_storage *ss = addr; int ret; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; ret = ipoib_check_lladdr(dev, ss); if (ret) return ret; set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static ssize_t create_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(create_child); static ssize_t delete_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey < 0 || pkey > 0xffff) return -EINVAL; ret = ipoib_vlan_delete(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_pkey); } /* * We erroneously exposed the iface's port number in the dev_id * sysfs field long after dev_port was introduced for that purpose[1], * and we need to stop everyone from relying on that. * Let's overload the shower routine for the dev_id file here * to gently bring the issue up. * * [1] https://www.spinics.net/lists/netdev/msg272123.html */ static ssize_t dev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); /* * ndev->dev_port will be equal to 0 in old kernel prior to commit * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface * port numbers") Zero was chosen as special case for user space * applications to fallback and query dev_id to check if it has * different value or not. * * Don't print warning in such scenario. * * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 */ if (ndev->dev_port && ndev->dev_id == ndev->dev_port) netdev_info_once(ndev, "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n", current->comm); return sysfs_emit(buf, "%#x\n", ndev->dev_id); } static DEVICE_ATTR_RO(dev_id); static int ipoib_intercept_dev_id_attr(struct net_device *dev) { device_remove_file(&dev->dev, &dev_attr_dev_id); return device_create_file(&dev->dev, &dev_attr_dev_id); } static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u32 port) { struct rtnl_link_ops *ops = ipoib_get_link_ops(); struct rdma_netdev_alloc_params params; struct ipoib_dev_priv *priv; struct net_device *ndev; int result; ndev = ipoib_intf_alloc(hca, port, format); if (IS_ERR(ndev)) { pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, PTR_ERR(ndev)); return ndev; } priv = ipoib_priv(ndev); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); ib_register_event_handler(&priv->event_handler); /* call event handler to ensure pkey in sync */ queue_work(ipoib_workqueue, &priv->flush_heavy); ndev->rtnl_link_ops = ipoib_get_link_ops(); result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); ipoib_parent_unregister_pre(ndev); ipoib_intf_free(ndev); free_netdev(ndev); return ERR_PTR(result); } if (hca->ops.rdma_netdev_get_params) { int rc = hca->ops.rdma_netdev_get_params(hca, port, RDMA_NETDEV_IPOIB, &params); if (!rc && ops->priv_size < params.sizeof_priv) ops->priv_size = params.sizeof_priv; } /* * We cannot set priv_destructor before register_netdev because we * need priv to be always valid during the error flow to execute * ipoib_parent_unregister_pre(). Instead handle it manually and only * enter priv_destructor mode once we are completely registered. */ ndev->priv_destructor = ipoib_intf_free; if (ipoib_intercept_dev_id_attr(ndev)) goto sysfs_failed; if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_create_child)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_delete_child)) goto sysfs_failed; return ndev; sysfs_failed: ipoib_parent_unregister_pre(ndev); unregister_netdev(ndev); return ERR_PTR(-ENOMEM); } static int ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct net_device *dev; struct ipoib_dev_priv *priv; unsigned int p; int count = 0; dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) return -ENOMEM; INIT_LIST_HEAD(dev_list); rdma_for_each_port (device, p) { if (!rdma_protocol_ib(device, p)) continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = ipoib_priv(dev); list_add_tail(&priv->list, dev_list); count++; } } if (!count) { kfree(dev_list); return -EOPNOTSUPP; } ib_set_client_data(device, &ipoib_client, dev_list); return 0; } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; struct list_head *dev_list = client_data; list_for_each_entry_safe(priv, tmp, dev_list, list) { LIST_HEAD(head); ipoib_parent_unregister_pre(priv->dev); rtnl_lock(); list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) unregister_netdevice_queue(cpriv->dev, &head); unregister_netdevice_queue(priv->dev, &head); unregister_netdevice_many(&head); rtnl_unlock(); } kfree(dev_list); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static struct notifier_block ipoib_netdev_notifier = { .notifier_call = ipoib_netdev_event, }; #endif static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); #endif /* * When copying small received packets, we only copy from the * linear data part of the SKB, so we rely on this condition. */ BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); ipoib_register_debugfs(); /* * We create a global workqueue here that is used for all flush * operations. However, if you attempt to flush a workqueue * from a task on that same workqueue, it deadlocks the system. * We want to be able to flush the tasks associated with a * specific net device, so we also create a workqueue for each * netdevice. We queue up the tasks for that device only on * its private workqueue, and we only queue up flush events * on our global flush workqueue. This avoids the deadlocks. */ ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; ret = ipoib_netlink_init(); if (ret) goto err_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG register_netdevice_notifier(&ipoib_netdev_notifier); #endif return 0; err_client: ib_unregister_client(&ipoib_client); err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: ipoib_unregister_debugfs(); return ret; } static void __exit ipoib_cleanup_module(void) { #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG unregister_netdevice_notifier(&ipoib_netdev_notifier); #endif ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module);
1450 1449 1435 1435 1435 1205 1205 1205 1205 1123 940 1205 986 1205 1205 986 976 1198 1198 124 504 1450 1450 1450 1521 1521 1499 1449 28 1450 1450 222 222 38 35 222 222 1450 1450 222 1450 1450 1403 43 1450 1450 23 1367 1428 1428 401 1399 1296 222 1252 482 1296 1280 23 1521 77 1521 1642 1642 1642 1642 1642 1642 1642 1642 1642 1642 1642 1639 1638 1367 1209 1207 1190 1063 1207 296 1207 992 992 992 1205 1642 1205 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> #include "trace.h" #include "../internal.h" /* * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) #define IOMAP_DIO_NEED_SYNC (1U << 29) #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; loff_t i_size; loff_t size; atomic_t ref; unsigned flags; int error; size_t done_before; bool wait_for_completion; union { /* used during submission and for synchronous completion: */ struct { struct iov_iter *iter; struct task_struct *waiter; } submit; /* used for aio completion: */ struct { struct work_struct work; } aio; }; }; static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf) { if (dio->dops && dio->dops->bio_set) return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL, dio->dops->bio_set); return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL); } static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { struct kiocb *iocb = dio->iocb; atomic_inc(&dio->ref); /* Sync dio can't be polled reliably */ if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) { bio_set_polled(bio, iocb); WRITE_ONCE(iocb->private, bio); } if (dio->dops && dio->dops->submit_io) dio->dops->submit_io(iter, bio, pos); else submit_bio(bio); } ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; loff_t offset = iocb->ki_pos; ssize_t ret = dio->error; if (dops && dops->end_io) ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size; /* check for short read */ if (offset + ret > dio->i_size && !(dio->flags & IOMAP_DIO_WRITE)) ret = dio->i_size - offset; } /* * Try again to invalidate clean pages which might have been cached by * non-direct readahead, or faulted in by get_user_pages() if the source * of the write was an mmap'ed region of the file we're writing. Either * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... * * And this page cache invalidation has to be after ->end_io(), as some * filesystems convert unwritten extents to real allocations in * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); if (ret > 0) { iocb->ki_pos += ret; /* * If this is a DSYNC write, make sure we push it to stable * storage now that we've written data. */ if (dio->flags & IOMAP_DIO_NEED_SYNC) ret = generic_write_sync(iocb, ret); if (ret > 0) ret += dio->done_before; } trace_iomap_dio_complete(iocb, dio->error, ret); kfree(dio); return ret; } EXPORT_SYMBOL_GPL(iomap_dio_complete); static ssize_t iomap_dio_deferred_complete(void *data) { return iomap_dio_complete(data); } static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; iocb->ki_complete(iocb, iomap_dio_complete(dio)); } /* * Set an error in the dio if none is set yet. We have to use cmpxchg * as the submission context and the completion context(s) can race to * update the error. */ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) { cmpxchg(&dio->error, 0, ret); } void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (!atomic_dec_and_test(&dio->ref)) goto release_bio; /* * Synchronous dio, task itself will handle any completion work * that needs after IO. All we need to do is wake the task. */ if (dio->wait_for_completion) { struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); goto release_bio; } /* * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline */ if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); goto release_bio; } /* * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule * our completion that way to avoid an async punt to a workqueue. */ if (dio->flags & IOMAP_DIO_CALLER_COMP) { /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; /* * Invoke ->ki_complete() directly. We've assigned our * dio_complete callback handler, and since the issuer set * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will * notice ->dio_complete being set and will defer calling that * handler until it can be done from a safe task context. * * Note that the 'res' being passed in here is not important * for this case. The actual completion value of the request * will be gotten from dio_complete when that is run by the * issuer. */ iocb->ki_complete(iocb, 0); goto release_bio; } /* * Async DIO completion that requires filesystem level completion work * gets punted to a work queue to complete as the operation may require * more IO to be issued to finalise filesystem metadata changes or * guarantee data integrity. */ INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, &dio->aio.work); release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { struct inode *inode = file_inode(dio->iocb->ki_filp); struct page *page = ZERO_PAGE(0); struct bio *bio; bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } /* * Figure out the bio's operation flags from the dio request, the * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; return opflags; } static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; int nr_pages, ret = 0; size_t copied = 0; size_t orig_count; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; if (iomap->type == IOMAP_UNWRITTEN) { dio->flags |= IOMAP_DIO_UNWRITTEN; need_zeroout = true; } if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; } else if (iomap->type == IOMAP_MAPPED) { /* * Use a FUA write if we need datasync semantics, this is a pure * data IO that doesn't require any metadata updates (including * after IO completion such as unwritten extent conversion) and * the underlying device either supports FUA or doesn't have * a volatile write cache. This allows us to avoid cache flushes * on IO completion. If we can't use writethrough and need to * sync, disable in-task completions as dio completion will * need to call generic_write_sync() which will do a blocking * fsync / cache flush call. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && (dio->flags & IOMAP_DIO_WRITE_THROUGH) && (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) use_fua = true; else if (dio->flags & IOMAP_DIO_NEED_SYNC) dio->flags &= ~IOMAP_DIO_CALLER_COMP; } /* * Save the original count and trim the iter to just the extent we * are operating on right now. The iter will be re-expanded once * we are done. */ orig_count = iov_iter_count(dio->submit.iter); iov_iter_truncate(dio->submit.iter, length); if (!iov_iter_count(dio->submit.iter)) goto out; /* * We can only do deferred completion for pure overwrites that * don't require additional IO at completion. This rules out * writes that need zeroing or extent conversion, extend * the file size, or issue journal IO or cache flushes * during completion processing. */ if (need_zeroout || ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) dio->flags &= ~IOMAP_DIO_CALLER_COMP; /* * The rules for polled IO completions follow the guidelines as the * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. */ if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); if (pad) iomap_dio_zero(iter, dio, pos - pad, pad); } /* * Set the operation flags early so that bio_iov_iter_get_pages * can set up the page vector appropriately for a ZONE_APPEND * operation. */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; if (dio->error) { iov_iter_revert(dio->submit.iter, copied); copied = ret = 0; goto out; } bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall * through to the sub-block tail zeroing here, otherwise * this short IO may expose stale data in the tail of * the block we haven't written data to. */ bio_put(bio); goto zero_tail; } n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } dio->size += n; copied += n; nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); /* * We can only poll for single bio I/Os. */ if (nr_pages) dio->iocb->ki_flags &= ~IOCB_HIPRI; iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); /* * We need to zeroout the tail of a sub-block write if the extent type * requires zeroing or the write extends beyond EOF. If we don't zero * the block tail in the latter case, we can expose stale data via mmap * reads of the EOF block. */ zero_tail: if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) return copied; return ret; } static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; return length; } static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; size_t copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = iomi->inode->i_size; if (pos > size) memset(iomap_inline_data(iomap, size), 0, pos - size); copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) i_size_write(iomi->inode, pos + copied); mark_inode_dirty(iomi->inode); } } else { copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; if (!copied) return -EFAULT; return copied; } static loff_t iomap_dio_iter(const struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) return -EIO; return iomap_dio_hole_iter(iter, dio); case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) return iomap_dio_hole_iter(iter, dio); return iomap_dio_bio_iter(iter, dio); case IOMAP_MAPPED: return iomap_dio_bio_iter(iter, dio); case IOMAP_INLINE: return iomap_dio_inline_iter(iter, dio); case IOMAP_DELALLOC: /* * DIO is not serialised against mmap() access at all, and so * if the page_mkwrite occurs between the writeback and the * iomap_iter() call in the DIO path, then it will see the * DELALLOC block that the page-mkwrite allocated. */ pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", dio->iocb->ki_filp, current->comm); return -EIO; default: WARN_ON_ONCE(1); return -EIO; } } /* * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO * is being issued as AIO or not. This allows us to optimise pure data writes * to use REQ_FUA rather than requiring generic_write_sync() to issue a * REQ_FLUSH post write. This is slightly tricky because a single request here * can be mapped into multiple disjoint IOs and only a subset of the IOs issued * may be pure data writes. In that case, we still need to do a full data sync * completion. * * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, * __iomap_dio_rw can return a partial result if it encounters a non-resident * page in @iter after preparing a transfer. In that case, the non-resident * pages can be faulted in and the request resumed with @done_before set to the * number of bytes previously transferred. The request will then complete with * the correct total number of bytes transferred; this is essential for * completing partial requests asynchronously. * * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before) { struct inode *inode = file_inode(iocb->ki_filp); struct iomap_iter iomi = { .inode = inode, .pos = iocb->ki_pos, .len = iov_iter_count(iter), .flags = IOMAP_DIRECT, .private = private, }; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); struct blk_plug plug; struct iomap_dio *dio; loff_t ret = 0; trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); if (!iomi.len) return NULL; dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return ERR_PTR(-ENOMEM); dio->iocb = iocb; atomic_set(&dio->ref, 1); dio->size = 0; dio->i_size = i_size_read(inode); dio->dops = dops; dio->error = 0; dio->flags = 0; dio->done_before = done_before; dio->submit.iter = iter; dio->submit.waiter = current; if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; if (iomi.pos >= dio->i_size) goto out_free_dio; if (user_backed_iter(iter)) dio->flags |= IOMAP_DIO_DIRTY; ret = kiocb_write_and_wait(iocb, iomi.len); if (ret) goto out_free_dio; } else { iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; /* * Flag as supporting deferred completions, if the issuer * groks it. This can avoid a workqueue punt for writes. * We may later clear this flag if we need to do other IO * as part of this IO completion. */ if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) dio->flags |= IOMAP_DIO_CALLER_COMP; if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || iomi.pos + iomi.len > dio->i_size) goto out_free_dio; iomi.flags |= IOMAP_OVERWRITE_ONLY; } /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; /* * For datasync only writes, we optimistically try using * WRITE_THROUGH for this IO. This flag requires either * FUA writes through the device's write cache, or a * normal write to a device without a volatile write * cache. For the former, Any non-FUA write that occurs * will clear this flag, hence we know before completion * whether a cache flush is necessary. */ if (!(iocb->ki_flags & IOCB_SYNC)) dio->flags |= IOMAP_DIO_WRITE_THROUGH; } /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. */ ret = kiocb_invalidate_pages(iocb, iomi.len); if (ret) { if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); ret = -ENOTBLK; } goto out_free_dio; } if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) goto out_free_dio; } } inode_dio_begin(inode); blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { iomi.processed = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. */ iocb->ki_flags &= ~IOCB_HIPRI; } blk_finish_plug(&plug); /* * We only report that we've read data up to i_size. * Revert iter to a state corresponding to that as some callers (such * as the splice code) rely on it. */ if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) iov_iter_revert(iter, iomi.pos - dio->i_size); if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { if (!(iocb->ki_flags & IOCB_NOWAIT)) wait_for_completion = true; ret = 0; } /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { wait_for_completion = true; ret = 0; } if (ret < 0) iomap_dio_set_error(dio, ret); /* * If all the writes we issued were already written through to the * media, we don't need to flush the cache on IO completion. Clear the * sync flag for this case. */ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three different * ways we can progress here: * * (a) If this is the last reference we will always complete and free * the dio ourselves. * (b) If this is not the last reference, and we serve an asynchronous * iocb, we must never touch the dio after the decrement, the * I/O completion handler will complete and free it. * (c) If this is not the last reference, but we serve a synchronous * iocb, the I/O completion handler will wake us up on the drop * of the final reference, and we will complete and free it here * after we got woken by the I/O completion handler. */ dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) { trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len); return ERR_PTR(-EIOCBQUEUED); } for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->submit.waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); } return dio; out_free_dio: kfree(dio); if (ret) return ERR_PTR(ret); return NULL; } EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before) { struct iomap_dio *dio; dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private, done_before); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw);
62 235 400 223 11 222 72 218 39 40 228 2 3 11 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/gro_cells.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; __be16 tun_flags; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be32 label; /* Flow Label for IPv6 */ u32 nhid; __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. */ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ struct ip_tunnel_info * : ((void *)((info) + 1))\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { __be16 flags; __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, __be16 tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; if (!info) return true; if (info->key.tun_flags & TUNNEL_NOCACHE) return false; return true; } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, __be16 flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline bool pskb_inet_may_pull(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull(skb, nhlen); } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, info + 1, info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, __be16 flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); info->key.tun_flags |= flags; } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, __be16 flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */
4575 4548 4469 4583 186 1687 1686 1687 1686 1688 68 186 68 68 1516 512 1516 1224 1223 1222 209 209 18 3084 1107 147 1107 878 24 878 2 189 189 189 189 189 189 189 189 189 3967 3964 3966 1588 1238 189 189 1590 1577 11 3 8 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include <uapi/linux/btf.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/math64.h> static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ if (!!log->ubuf != !!log->len_total) return false; /* log buf without log_level is meaningless */ if (log->ubuf && log->level == 0) return false; if (log->level & ~BPF_LOG_MASK) return false; if (log->len_total > UINT_MAX >> 2) return false; return true; } int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size) { log->level = log_level; log->ubuf = log_buf; log->len_total = log_size; /* log attributes have to be sane */ if (!bpf_verifier_log_attr_valid(log)) return -EINVAL; return 0; } static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) { /* add_len includes terminal \0, so no need for +1. */ u64 len = log->end_pos + add_len; /* log->len_max could be larger than our current len due to * bpf_vlog_reset() calls, so we maintain the max of any length at any * previous point */ if (len > UINT_MAX) log->len_max = UINT_MAX; else if (len > log->len_max) log->len_max = len; } void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { u64 cur_pos; u32 new_n, n; n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } n += 1; /* include terminating zero */ bpf_vlog_update_len_max(log, n); if (log->level & BPF_LOG_FIXED) { /* check if we have at least something to put into user buf */ new_n = 0; if (log->end_pos < log->len_total) { new_n = min_t(u32, log->len_total - log->end_pos, n); log->kbuf[new_n - 1] = '\0'; } cur_pos = log->end_pos; log->end_pos += n - 1; /* don't count terminating '\0' */ if (log->ubuf && new_n && copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) goto fail; } else { u64 new_end, new_start; u32 buf_start, buf_end, new_n; new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) new_start = new_end - log->len_total; else new_start = log->start_pos; log->start_pos = new_start; log->end_pos = new_end - 1; /* don't count terminating '\0' */ if (!log->ubuf) return; new_n = min(n, log->len_total); cur_pos = new_end - new_n; div_u64_rem(cur_pos, log->len_total, &buf_start); div_u64_rem(new_end, log->len_total, &buf_end); /* new_end and buf_end are exclusive indices, so if buf_end is * exactly zero, then it actually points right to the end of * ubuf and there is no wrap around */ if (buf_end == 0) buf_end = log->len_total; /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is * nothing to write, because we always write at least * something, even if terminal '\0' */ if (buf_start < buf_end) { /* message fits within contiguous chunk of ubuf */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, buf_end - buf_start)) goto fail; } else { /* message wraps around the end of ubuf, copy in two chunks */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, log->len_total - buf_start)) goto fail; if (copy_to_user(log->ubuf, log->kbuf + n - buf_end, buf_end)) goto fail; } } return; fail: log->ubuf = NULL; } void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) { char zero = 0; u32 pos; if (WARN_ON_ONCE(new_pos > log->end_pos)) return; if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) return; /* if position to which we reset is beyond current log window, * then we didn't preserve any useful content and should adjust * start_pos to end up with an empty log (start_pos == end_pos) */ log->end_pos = new_pos; if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; if (!log->ubuf) return; if (log->level & BPF_LOG_FIXED) pos = log->end_pos + 1; else div_u64_rem(new_pos, log->len_total, &pos); if (pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } static void bpf_vlog_reverse_kbuf(char *buf, int len) { int i, j; for (i = 0, j = len - 1; i < j; i++, j--) swap(buf[i], buf[j]); } static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) { /* we split log->kbuf into two equal parts for both ends of array */ int n = sizeof(log->kbuf) / 2, nn; char *lbuf = log->kbuf, *rbuf = log->kbuf + n; /* Read ubuf's section [start, end) two chunks at a time, from left * and right side; within each chunk, swap all the bytes; after that * reverse the order of lbuf and rbuf and write result back to ubuf. * This way we'll end up with swapped contents of specified * [start, end) ubuf segment. */ while (end - start > 1) { nn = min(n, (end - start ) / 2); if (copy_from_user(lbuf, log->ubuf + start, nn)) return -EFAULT; if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) return -EFAULT; bpf_vlog_reverse_kbuf(lbuf, nn); bpf_vlog_reverse_kbuf(rbuf, nn); /* we write lbuf to the right end of ubuf, while rbuf to the * left one to end up with properly reversed overall ubuf */ if (copy_to_user(log->ubuf + start, rbuf, nn)) return -EFAULT; if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) return -EFAULT; start += nn; end -= nn; } return 0; } int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; int err; *log_size_actual = 0; if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) return 0; if (!log->ubuf) goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ if (log->start_pos == 0) goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the * buffer beginning and be a continuous zero-terminated string. Note * that if log->start_pos != 0 then we definitely filled up entire log * buffer with no gaps, and we just need to shift buffer contents to * the left by (log->start_pos % log->len_total) bytes. * * Unfortunately, user buffer could be huge and we don't want to * allocate temporary kernel memory of the same size just to shift * contents in a straightforward fashion. Instead, we'll be clever and * do in-place array rotation. This is a leetcode-style problem, which * could be solved by three rotations. * * Let's say we have log buffer that has to be shifted left by 7 bytes * (spaces and vertical bar is just for demonstrative purposes): * E F G H I J K | A B C D * * First, we reverse entire array: * D C B A | K J I H G F E * * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes * (KJIHGFE), resulting in a properly rotated array: * A B C D | E F G H I J K * * We'll utilize log->kbuf to read user memory chunk by chunk, swap * bytes, and write them back. Doing it byte-by-byte would be * unnecessarily inefficient. Altogether we are going to read and * write each byte twice, for total 4 memory copies between kernel and * user space. */ /* length of the chopped off part that will be the beginning; * len(ABCD) in the example above */ div_u64_rem(log->start_pos, log->len_total, &sublen); sublen = log->len_total - sublen; err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); if (err) log->ubuf = NULL; skip_log_rotate: *log_size_actual = log->len_max; /* properly initialized log has either both ubuf!=NULL and len_total>0 * or ubuf==NULL and len_total==0, so if this condition doesn't hold, * we got a fault somewhere along the way, so report it back */ if (!!log->ubuf != !!log->len_total) return -EFAULT; /* did truncation actually happen? */ if (log->ubuf && log->len_max > log->len_total) return -ENOSPC; return 0; } /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program */ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(&env->log)) return; va_start(args, fmt); bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_verifier_log_write); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_log);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RATELIMIT_H #define _LINUX_RATELIMIT_H #include <linux/ratelimit_types.h> #include <linux/sched.h> #include <linux/spinlock.h> static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { memset(rs, 0, sizeof(*rs)); raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; } static inline void ratelimit_default_init(struct ratelimit_state *rs) { return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); } static inline void ratelimit_state_exit(struct ratelimit_state *rs) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; if (rs->missed) { pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, rs->missed); rs->missed = 0; } } static inline void ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags) { rs->flags = flags; } extern struct ratelimit_state printk_ratelimit_state; #ifdef CONFIG_PRINTK #define WARN_ON_RATELIMIT(condition, state) ({ \ bool __rtn_cond = !!(condition); \ WARN_ON(__rtn_cond && __ratelimit(state)); \ __rtn_cond; \ }) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ int rtn = !!(condition); \ \ if (unlikely(rtn && __ratelimit(&_rs))) \ WARN(rtn, format, ##__VA_ARGS__); \ \ rtn; \ }) #else #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON(condition) #define WARN_RATELIMIT(condition, format, ...) \ ({ \ int rtn = WARN(condition, format, ##__VA_ARGS__); \ rtn; \ }) #endif #endif /* _LINUX_RATELIMIT_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/mii.h: definitions for MII-compatible transceivers * Originally drivers/net/sunhme.h. * * Copyright (C) 1996, 1999, 2001 David S. Miller (davem@redhat.com) */ #ifndef __LINUX_MII_H__ #define __LINUX_MII_H__ #include <linux/if.h> #include <linux/linkmode.h> #include <uapi/linux/mii.h> struct ethtool_cmd; struct mii_if_info { int phy_id; int advertising; int phy_id_mask; int reg_num_mask; unsigned int full_duplex : 1; /* is full duplex? */ unsigned int force_media : 1; /* is autoneg. disabled? */ unsigned int supports_gmii : 1; /* are GMII registers supported? */ struct net_device *dev; int (*mdio_read) (struct net_device *dev, int phy_id, int location); void (*mdio_write) (struct net_device *dev, int phy_id, int location, int val); }; extern int mii_link_ok (struct mii_if_info *mii); extern int mii_nway_restart (struct mii_if_info *mii); extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern void mii_ethtool_get_link_ksettings( struct mii_if_info *mii, struct ethtool_link_ksettings *cmd); extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern int mii_ethtool_set_link_ksettings( struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd); extern int mii_check_gmii_support(struct mii_if_info *mii); extern void mii_check_link (struct mii_if_info *mii); extern unsigned int mii_check_media (struct mii_if_info *mii, unsigned int ok_to_print, unsigned int init_media); extern int generic_mii_ioctl(struct mii_if_info *mii_if, struct mii_ioctl_data *mii_data, int cmd, unsigned int *duplex_changed); static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) { return (struct mii_ioctl_data *) &rq->ifr_ifru; } /** * mii_nway_result * @negotiated: value of MII ANAR and'd with ANLPAR * * Given a set of MII abilities, check each bit and returns the * currently supported media, in the priority order defined by * IEEE 802.3u. We use LPA_xxx constants but note this is not the * value of LPA solely, as described above. * * The one exception to IEEE 802.3u is that 100baseT4 is placed * between 100T-full and 100T-half. If your phy does not support * 100T4 this is fine. If your phy places 100T4 elsewhere in the * priority order, you will need to roll your own function. */ static inline unsigned int mii_nway_result (unsigned int negotiated) { unsigned int ret; if (negotiated & LPA_100FULL) ret = LPA_100FULL; else if (negotiated & LPA_100BASE4) ret = LPA_100BASE4; else if (negotiated & LPA_100HALF) ret = LPA_100HALF; else if (negotiated & LPA_10FULL) ret = LPA_10FULL; else ret = LPA_10HALF; return ret; } /** * mii_duplex * @duplex_lock: Non-zero if duplex is locked at full * @negotiated: value of MII ANAR and'd with ANLPAR * * A small helper function for a common case. Returns one * if the media is operating or locked at full duplex, and * returns zero otherwise. */ static inline unsigned int mii_duplex (unsigned int duplex_lock, unsigned int negotiated) { if (duplex_lock) return 1; if (mii_nway_result(negotiated) & LPA_DUPLEX) return 1; return 0; } /** * ethtool_adv_to_mii_adv_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_10baseT_Half) result |= ADVERTISE_10HALF; if (ethadv & ADVERTISED_10baseT_Full) result |= ADVERTISE_10FULL; if (ethadv & ADVERTISED_100baseT_Half) result |= ADVERTISE_100HALF; if (ethadv & ADVERTISED_100baseT_Full) result |= ADVERTISE_100FULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_PAUSE_CAP; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * linkmode_adv_to_mii_adv_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 linkmode_adv_to_mii_adv_t(unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising)) result |= ADVERTISE_10HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising)) result |= ADVERTISE_10FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising)) result |= ADVERTISE_100HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising)) result |= ADVERTISE_100FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_t * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to ethtool advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_10HALF) result |= ADVERTISED_10baseT_Half; if (adv & ADVERTISE_10FULL) result |= ADVERTISED_10baseT_Full; if (adv & ADVERTISE_100HALF) result |= ADVERTISED_100baseT_Half; if (adv & ADVERTISE_100FULL) result |= ADVERTISED_100baseT_Full; if (adv & ADVERTISE_PAUSE_CAP) result |= ADVERTISED_Pause; if (adv & ADVERTISE_PAUSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * ethtool_adv_to_mii_ctrl1000_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000HALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000FULL; return result; } /** * linkmode_adv_to_mii_ctrl1000_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 linkmode_adv_to_mii_ctrl1000_t(unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising)) result |= ADVERTISE_1000HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising)) result |= ADVERTISE_1000FULL; return result; } /** * mii_ctrl1000_to_ethtool_adv_t * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_ctrl1000_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000HALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_lpa_to_ethtool_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA * bits, when in 1000Base-T mode, to ethtool * LP advertisement settings. */ static inline u32 mii_lpa_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_LPACK) result |= ADVERTISED_Autoneg; return result | mii_adv_to_ethtool_adv_t(lpa); } /** * mii_stat1000_to_ethtool_lpa_t * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_stat1000_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_1000HALF) result |= ADVERTISED_1000baseT_Half; if (lpa & LPA_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_stat1000_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 bits, when in * 1000Base-T mode, to linkmode advertisement settings. Other bits in * advertising are not changes. */ static inline void mii_stat1000_mod_linkmode_lpa_t(unsigned long *advertising, u32 lpa) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, lpa & LPA_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, lpa & LPA_1000FULL); } /** * ethtool_adv_to_mii_adv_x * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000Base-X mode. */ static inline u32 ethtool_adv_to_mii_adv_x(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000XHALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000XFULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_1000XPAUSE; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_1000XPSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_x * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-X mode, to ethtool * advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000XHALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000XFULL) result |= ADVERTISED_1000baseT_Full; if (adv & ADVERTISE_1000XPAUSE) result |= ADVERTISED_Pause; if (adv & ADVERTISE_1000XPSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits to * linkmode advertisement settings. Leaves other bits unchanged. */ static inline void mii_adv_mod_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising, adv & ADVERTISE_10HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising, adv & ADVERTISE_10FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising, adv & ADVERTISE_100HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising, adv & ADVERTISE_100FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_CAP); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_ASYM); } /** * mii_adv_to_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to linkmode advertisement settings. Clears the old value * of advertising. */ static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_zero(advertising); mii_adv_mod_linkmode_adv_t(advertising, adv); } /** * mii_lpa_to_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Clears the * old value of advertising */ static inline void mii_lpa_to_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_to_linkmode_adv_t(lp_advertising, lpa); if (lpa & LPA_LPACK) linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising); } /** * mii_lpa_mod_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Leaves * other bits unchanged. */ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_mod_linkmode_adv_t(lp_advertising, lpa); linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising, lpa & LPA_LPACK); } static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, u32 ctrl1000) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, ctrl1000 & ADVERTISE_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, ctrl1000 & ADVERTISE_1000FULL); } /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising * * A small helper function that translates linkmode advertising to LVL * pause capabilities. */ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) { u32 lcl_adv = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; return lcl_adv; } /** * mii_lpa_mod_linkmode_x - decode the link partner's config_reg to linkmodes * @linkmodes: link modes array * @lpa: config_reg word from link partner * @fd_bit: link mode for 1000XFULL bit */ static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, int fd_bit) { linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, linkmodes, lpa & LPA_LPACK); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE_ASYM); linkmode_mod_bit(fd_bit, linkmodes, lpa & LPA_1000XFULL); } /** * linkmode_adv_to_mii_adv_x - encode a linkmode to config_reg * @linkmodes: linkmodes * @fd_bit: full duplex bit */ static inline u16 linkmode_adv_to_mii_adv_x(const unsigned long *linkmodes, int fd_bit) { u16 adv = 0; if (linkmode_test_bit(fd_bit, linkmodes)) adv |= ADVERTISE_1000XFULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPAUSE; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPSE_ASYM; return adv; } /** * mii_advertise_flowctrl - get flow control advertisement flags * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) */ static inline u16 mii_advertise_flowctrl(int cap) { u16 adv = 0; if (cap & FLOW_CTRL_RX) adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; if (cap & FLOW_CTRL_TX) adv ^= ADVERTISE_PAUSE_ASYM; return adv; } /** * mii_resolve_flowctrl_fdx * @lcladv: value of MII ADVERTISE register * @rmtadv: value of MII LPA register * * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3 */ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) { u8 cap = 0; if (lcladv & rmtadv & ADVERTISE_PAUSE_CAP) { cap = FLOW_CTRL_TX | FLOW_CTRL_RX; } else if (lcladv & rmtadv & ADVERTISE_PAUSE_ASYM) { if (lcladv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_RX; else if (rmtadv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_TX; } return cap; } /** * mii_bmcr_encode_fixed - encode fixed speed/duplex settings to a BMCR value * @speed: a SPEED_* value * @duplex: a DUPLEX_* value * * Encode the speed and duplex to a BMCR value. 2500, 1000, 100 and 10 Mbps are * supported. 2500Mbps is encoded to 1000Mbps. Other speeds are encoded as 10 * Mbps. Unknown duplex values are encoded to half-duplex. */ static inline u16 mii_bmcr_encode_fixed(int speed, int duplex) { u16 bmcr; switch (speed) { case SPEED_2500: case SPEED_1000: bmcr = BMCR_SPEED1000; break; case SPEED_100: bmcr = BMCR_SPEED100; break; case SPEED_10: default: bmcr = BMCR_SPEED10; break; } if (duplex == DUPLEX_FULL) bmcr |= BMCR_FULLDPLX; return bmcr; } #endif /* __LINUX_MII_H__ */
3 2 2 1 3 3 2 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 // SPDX-License-Identifier: GPL-2.0-only /* * ppp_deflate.c - interface the zlib procedures for Deflate compression * and decompression (as used by gzip) to the PPP code. * * Copyright 1994-1998 Paul Mackerras. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/string.h> #include <linux/ppp_defs.h> #include <linux/ppp-comp.h> #include <linux/zlib.h> #include <asm/unaligned.h> /* * State for a Deflate (de)compressor. */ struct ppp_deflate_state { int seqno; int w_size; int unit; int mru; int debug; z_stream strm; struct compstat stats; }; #define DEFLATE_OVHD 2 /* Deflate overhead/packet */ static void *z_comp_alloc(unsigned char *options, int opt_len); static void *z_decomp_alloc(unsigned char *options, int opt_len); static void z_comp_free(void *state); static void z_decomp_free(void *state); static int z_comp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int debug); static int z_decomp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug); static int z_compress(void *state, unsigned char *rptr, unsigned char *obuf, int isize, int osize); static void z_incomp(void *state, unsigned char *ibuf, int icnt); static int z_decompress(void *state, unsigned char *ibuf, int isize, unsigned char *obuf, int osize); static void z_comp_reset(void *state); static void z_decomp_reset(void *state); static void z_comp_stats(void *state, struct compstat *stats); /** * z_comp_free - free the memory used by a compressor * @arg: pointer to the private state for the compressor. */ static void z_comp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { zlib_deflateEnd(&state->strm); vfree(state->strm.workspace); kfree(state); } } /** * z_comp_alloc - allocate space for a compressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that the peer is requesting that we use in compressing * data to be sent to it. * * Returns the pointer to the private state for the compressor, * or NULL if we could not allocate enough memory. */ static void *z_comp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->strm.next_in = NULL; state->w_size = w_size; state->strm.workspace = vmalloc(zlib_deflate_workspacesize(-w_size, 8)); if (state->strm.workspace == NULL) goto out_free; if (zlib_deflateInit2(&state->strm, Z_DEFAULT_COMPRESSION, DEFLATE_METHOD_VAL, -w_size, 8, Z_DEFAULT_STRATEGY) != Z_OK) goto out_free; return (void *) state; out_free: z_comp_free(state); return NULL; } /** * z_comp_init - initialize a previously-allocated compressor. * @arg: pointer to the private state for the compressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the compressor was allocated. The compressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_comp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; zlib_deflateReset(&state->strm); return 1; } /** * z_comp_reset - reset a previously-allocated compressor. * @arg: pointer to private state for the compressor. * * This clears the history for the compressor and makes it * ready to start emitting a new compressed stream. */ static void z_comp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_deflateReset(&state->strm); } /** * z_compress - compress a PPP packet with Deflate compression. * @arg: pointer to private state for the compressor * @rptr: uncompressed packet (input) * @obuf: compressed packet (output) * @isize: size of uncompressed packet * @osize: space available at @obuf * * Returns the length of the compressed packet, or 0 if the * packet is incompressible. */ static int z_compress(void *arg, unsigned char *rptr, unsigned char *obuf, int isize, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int r, proto, off, olen, oavail; unsigned char *wptr; /* * Check that the protocol is in the range we handle. */ proto = PPP_PROTOCOL(rptr); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return 0; /* Don't generate compressed packets which are larger than the uncompressed packet. */ if (osize > isize) osize = isize; wptr = obuf; /* * Copy over the PPP header and store the 2-byte sequence number. */ wptr[0] = PPP_ADDRESS(rptr); wptr[1] = PPP_CONTROL(rptr); put_unaligned_be16(PPP_COMP, wptr + 2); wptr += PPP_HDRLEN; put_unaligned_be16(state->seqno, wptr); wptr += DEFLATE_OVHD; olen = PPP_HDRLEN + DEFLATE_OVHD; state->strm.next_out = wptr; state->strm.avail_out = oavail = osize - olen; ++state->seqno; off = (proto > 0xff) ? 2 : 3; /* skip 1st proto byte if 0 */ rptr += off; state->strm.next_in = rptr; state->strm.avail_in = (isize - off); for (;;) { r = zlib_deflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_ERR "z_compress: deflate returned %d\n", r); break; } if (state->strm.avail_out == 0) { olen += oavail; state->strm.next_out = NULL; state->strm.avail_out = oavail = 1000000; } else { break; /* all done */ } } olen += oavail - state->strm.avail_out; /* * See if we managed to reduce the size of the packet. */ if (olen < isize && olen <= osize) { state->stats.comp_bytes += olen; state->stats.comp_packets++; } else { state->stats.inc_bytes += isize; state->stats.inc_packets++; olen = 0; } state->stats.unc_bytes += isize; state->stats.unc_packets++; return olen; } /** * z_comp_stats - return compression statistics for a compressor * or decompressor. * @arg: pointer to private space for the (de)compressor * @stats: pointer to a struct compstat to receive the result. */ static void z_comp_stats(void *arg, struct compstat *stats) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; *stats = state->stats; } /** * z_decomp_free - Free the memory used by a decompressor. * @arg: pointer to private space for the decompressor. */ static void z_decomp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { vfree(state->strm.workspace); kfree(state); } } /** * z_decomp_alloc - allocate space for a decompressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that we are requesting the peer to use in compressing * data to be sent to us. * * Returns the pointer to the private state for the decompressor, * or NULL if we could not allocate enough memory. */ static void *z_decomp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->w_size = w_size; state->strm.next_out = NULL; state->strm.workspace = vmalloc(zlib_inflate_workspacesize()); if (state->strm.workspace == NULL) goto out_free; if (zlib_inflateInit2(&state->strm, -w_size) != Z_OK) goto out_free; return (void *) state; out_free: z_decomp_free(state); return NULL; } /** * z_decomp_init - initialize a previously-allocated decompressor. * @arg: pointer to the private state for the decompressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @mru: maximum length of decompressed packets * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the decompressor was allocated. The decompressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_decomp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; state->mru = mru; zlib_inflateReset(&state->strm); return 1; } /** * z_decomp_reset - reset a previously-allocated decompressor. * @arg: pointer to private state for the decompressor. * * This clears the history for the decompressor and makes it * ready to receive a new compressed stream. */ static void z_decomp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_inflateReset(&state->strm); } /** * z_decompress - decompress a Deflate-compressed packet. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input (compressed) packet data * @isize: length of input packet * @obuf: pointer to space for output (decompressed) packet * @osize: amount of space available at @obuf * * Because of patent problems, we return DECOMP_ERROR for errors * found by inspecting the input data and for system problems, but * DECOMP_FATALERROR for any errors which could possibly be said to * be being detected "after" decompression. For DECOMP_ERROR, * we can issue a CCP reset-request; for DECOMP_FATALERROR, we may be * infringing a patent of Motorola's if we do, so we take CCP down * instead. * * Given that the frame has the correct sequence number and a good FCS, * errors such as invalid codes in the input most likely indicate a * bug, so we return DECOMP_FATALERROR for them in order to turn off * compression, even though they are detected by inspecting the input. */ static int z_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int olen, seq, r; int decode_proto, overflow; unsigned char overflow_buf[1]; if (isize <= PPP_HDRLEN + DEFLATE_OVHD) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: short pkt (%d)\n", state->unit, isize); return DECOMP_ERROR; } /* Check the sequence number. */ seq = get_unaligned_be16(ibuf + PPP_HDRLEN); if (seq != (state->seqno & 0xffff)) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: bad seq # %d, expected %d\n", state->unit, seq, state->seqno & 0xffff); return DECOMP_ERROR; } ++state->seqno; /* * Fill in the first part of the PPP header. The protocol field * comes from the decompressed data. */ obuf[0] = PPP_ADDRESS(ibuf); obuf[1] = PPP_CONTROL(ibuf); obuf[2] = 0; /* * Set up to call inflate. We set avail_out to 1 initially so we can * look at the first byte of the output and decide whether we have * a 1-byte or 2-byte protocol field. */ state->strm.next_in = ibuf + PPP_HDRLEN + DEFLATE_OVHD; state->strm.avail_in = isize - (PPP_HDRLEN + DEFLATE_OVHD); state->strm.next_out = obuf + 3; state->strm.avail_out = 1; decode_proto = 1; overflow = 0; /* * Call inflate, supplying more input or output as needed. */ for (;;) { r = zlib_inflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: inflate returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); return DECOMP_FATALERROR; } if (state->strm.avail_out != 0) break; /* all done */ if (decode_proto) { state->strm.avail_out = osize - PPP_HDRLEN; if ((obuf[3] & 1) == 0) { /* 2-byte protocol field */ obuf[2] = obuf[3]; --state->strm.next_out; ++state->strm.avail_out; } decode_proto = 0; } else if (!overflow) { /* * We've filled up the output buffer; the only way to * find out whether inflate has any more characters * left is to give it another byte of output space. */ state->strm.next_out = overflow_buf; state->strm.avail_out = 1; overflow = 1; } else { if (state->debug) printk(KERN_DEBUG "z_decompress%d: ran out of mru\n", state->unit); return DECOMP_FATALERROR; } } if (decode_proto) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: didn't get proto\n", state->unit); return DECOMP_ERROR; } olen = osize + overflow - state->strm.avail_out; state->stats.unc_bytes += olen; state->stats.unc_packets++; state->stats.comp_bytes += isize; state->stats.comp_packets++; return olen; } /** * z_incomp - add incompressible input data to the history. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input packet data * @icnt: length of input data. */ static void z_incomp(void *arg, unsigned char *ibuf, int icnt) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int proto, r; /* * Check that the protocol is one we handle. */ proto = PPP_PROTOCOL(ibuf); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return; ++state->seqno; /* * We start at the either the 1st or 2nd byte of the protocol field, * depending on whether the protocol value is compressible. */ state->strm.next_in = ibuf + 3; state->strm.avail_in = icnt - 3; if (proto > 0xff) { --state->strm.next_in; ++state->strm.avail_in; } r = zlib_inflateIncomp(&state->strm); if (r != Z_OK) { /* gak! */ if (state->debug) { printk(KERN_DEBUG "z_incomp%d: inflateIncomp returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); } return; } /* * Update stats. */ state->stats.inc_bytes += icnt; state->stats.inc_packets++; state->stats.unc_bytes += icnt; state->stats.unc_packets++; } /************************************************************* * Module interface table *************************************************************/ /* These are in ppp_generic.c */ extern int ppp_register_compressor (struct compressor *cp); extern void ppp_unregister_compressor (struct compressor *cp); /* * Procedures exported to if_ppp.c. */ static struct compressor ppp_deflate = { .compress_proto = CI_DEFLATE, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static struct compressor ppp_deflate_draft = { .compress_proto = CI_DEFLATE_DRAFT, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static int __init deflate_init(void) { int rc; rc = ppp_register_compressor(&ppp_deflate); if (rc) return rc; rc = ppp_register_compressor(&ppp_deflate_draft); if (rc) { ppp_unregister_compressor(&ppp_deflate); return rc; } pr_info("PPP Deflate Compression module registered\n"); return 0; } static void __exit deflate_cleanup(void) { ppp_unregister_compressor(&ppp_deflate); ppp_unregister_compressor(&ppp_deflate_draft); } module_init(deflate_init); module_exit(deflate_cleanup); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE)); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE_DRAFT));
22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 // SPDX-License-Identifier: GPL-2.0-or-later /* * Symmetric key cipher operations. * * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, * the kernel is given a chance to schedule us once per page. * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/bug.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" enum { SKCIPHER_WALK_PHYS = 1 << 0, SKCIPHER_WALK_SLOW = 1 << 1, SKCIPHER_WALK_COPY = 1 << 2, SKCIPHER_WALK_DIFF = 1 << 3, SKCIPHER_WALK_SLEEP = 1 << 4, }; struct skcipher_walk_buffer { struct list_head entry; struct scatter_walk dst; unsigned int len; u8 *data; u8 buffer[]; }; static int skcipher_walk_next(struct skcipher_walk *walk); static inline void skcipher_map_src(struct skcipher_walk *walk) { walk->src.virt.addr = scatterwalk_map(&walk->in); } static inline void skcipher_map_dst(struct skcipher_walk *walk) { walk->dst.virt.addr = scatterwalk_map(&walk->out); } static inline void skcipher_unmap_src(struct skcipher_walk *walk) { scatterwalk_unmap(walk->src.virt.addr); } static inline void skcipher_unmap_dst(struct skcipher_walk *walk) { scatterwalk_unmap(walk->dst.virt.addr); } static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk) { return walk->flags & SKCIPHER_WALK_SLEEP ? GFP_KERNEL : GFP_ATOMIC; } /* Get a spot of the specified length that does not straddle a page. * The caller needs to ensure that there is enough space for this operation. */ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len) { u8 *end_page = (u8 *)(((unsigned long)(start + len - 1)) & PAGE_MASK); return max(start, end_page); } static inline struct skcipher_alg *__crypto_skcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct skcipher_alg, base); } static inline struct crypto_istat_cipher *skcipher_get_stat( struct skcipher_alg *alg) { #ifdef CONFIG_CRYPTO_STATS return &alg->stat; #else return NULL; #endif } static inline int crypto_skcipher_errstat(struct skcipher_alg *alg, int err) { struct crypto_istat_cipher *istat = skcipher_get_stat(alg); if (!IS_ENABLED(CONFIG_CRYPTO_STATS)) return err; if (err && err != -EINPROGRESS && err != -EBUSY) atomic64_inc(&istat->err_cnt); return err; } static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) { u8 *addr; addr = (u8 *)ALIGN((unsigned long)walk->buffer, walk->alignmask + 1); addr = skcipher_get_spot(addr, bsize); scatterwalk_copychunks(addr, &walk->out, bsize, (walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1); return 0; } int skcipher_walk_done(struct skcipher_walk *walk, int err) { unsigned int n = walk->nbytes; unsigned int nbytes = 0; if (!n) goto finish; if (likely(err >= 0)) { n -= err; nbytes = walk->total - n; } if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS | SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF)))) { unmap_src: skcipher_unmap_src(walk); } else if (walk->flags & SKCIPHER_WALK_DIFF) { skcipher_unmap_dst(walk); goto unmap_src; } else if (walk->flags & SKCIPHER_WALK_COPY) { skcipher_map_dst(walk); memcpy(walk->dst.virt.addr, walk->page, n); skcipher_unmap_dst(walk); } else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) { if (err > 0) { /* * Didn't process all bytes. Either the algorithm is * broken, or this was the last step and it turned out * the message wasn't evenly divisible into blocks but * the algorithm requires it. */ err = -EINVAL; nbytes = 0; } else n = skcipher_done_slow(walk, n); } if (err > 0) err = 0; walk->total = nbytes; walk->nbytes = 0; scatterwalk_advance(&walk->in, n); scatterwalk_advance(&walk->out, n); scatterwalk_done(&walk->in, 0, nbytes); scatterwalk_done(&walk->out, 1, nbytes); if (nbytes) { crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ? CRYPTO_TFM_REQ_MAY_SLEEP : 0); return skcipher_walk_next(walk); } finish: /* Short-circuit for the common/fast path. */ if (!((unsigned long)walk->buffer | (unsigned long)walk->page)) goto out; if (walk->flags & SKCIPHER_WALK_PHYS) goto out; if (walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); out: return err; } EXPORT_SYMBOL_GPL(skcipher_walk_done); void skcipher_walk_complete(struct skcipher_walk *walk, int err) { struct skcipher_walk_buffer *p, *tmp; list_for_each_entry_safe(p, tmp, &walk->buffers, entry) { u8 *data; if (err) goto done; data = p->data; if (!data) { data = PTR_ALIGN(&p->buffer[0], walk->alignmask + 1); data = skcipher_get_spot(data, walk->stride); } scatterwalk_copychunks(data, &p->dst, p->len, 1); if (offset_in_page(p->data) + p->len + walk->stride > PAGE_SIZE) free_page((unsigned long)p->data); done: list_del(&p->entry); kfree(p); } if (!err && walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); } EXPORT_SYMBOL_GPL(skcipher_walk_complete); static void skcipher_queue_write(struct skcipher_walk *walk, struct skcipher_walk_buffer *p) { p->dst = walk->out; list_add_tail(&p->entry, &walk->buffers); } static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize) { bool phys = walk->flags & SKCIPHER_WALK_PHYS; unsigned alignmask = walk->alignmask; struct skcipher_walk_buffer *p; unsigned a; unsigned n; u8 *buffer; void *v; if (!phys) { if (!walk->buffer) walk->buffer = walk->page; buffer = walk->buffer; if (buffer) goto ok; } /* Start with the minimum alignment of kmalloc. */ a = crypto_tfm_ctx_alignment() - 1; n = bsize; if (phys) { /* Calculate the minimum alignment of p->buffer. */ a &= (sizeof(*p) ^ (sizeof(*p) - 1)) >> 1; n += sizeof(*p); } /* Minimum size to align p->buffer by alignmask. */ n += alignmask & ~a; /* Minimum size to ensure p->buffer does not straddle a page. */ n += (bsize - 1) & ~(alignmask | a); v = kzalloc(n, skcipher_walk_gfp(walk)); if (!v) return skcipher_walk_done(walk, -ENOMEM); if (phys) { p = v; p->len = bsize; skcipher_queue_write(walk, p); buffer = p->buffer; } else { walk->buffer = v; buffer = v; } ok: walk->dst.virt.addr = PTR_ALIGN(buffer, alignmask + 1); walk->dst.virt.addr = skcipher_get_spot(walk->dst.virt.addr, bsize); walk->src.virt.addr = walk->dst.virt.addr; scatterwalk_copychunks(walk->src.virt.addr, &walk->in, bsize, 0); walk->nbytes = bsize; walk->flags |= SKCIPHER_WALK_SLOW; return 0; } static int skcipher_next_copy(struct skcipher_walk *walk) { struct skcipher_walk_buffer *p; u8 *tmp = walk->page; skcipher_map_src(walk); memcpy(tmp, walk->src.virt.addr, walk->nbytes); skcipher_unmap_src(walk); walk->src.virt.addr = tmp; walk->dst.virt.addr = tmp; if (!(walk->flags & SKCIPHER_WALK_PHYS)) return 0; p = kmalloc(sizeof(*p), skcipher_walk_gfp(walk)); if (!p) return -ENOMEM; p->data = walk->page; p->len = walk->nbytes; skcipher_queue_write(walk, p); if (offset_in_page(walk->page) + walk->nbytes + walk->stride > PAGE_SIZE) walk->page = NULL; else walk->page += walk->nbytes; return 0; } static int skcipher_next_fast(struct skcipher_walk *walk) { unsigned long diff; walk->src.phys.page = scatterwalk_page(&walk->in); walk->src.phys.offset = offset_in_page(walk->in.offset); walk->dst.phys.page = scatterwalk_page(&walk->out); walk->dst.phys.offset = offset_in_page(walk->out.offset); if (walk->flags & SKCIPHER_WALK_PHYS) return 0; diff = walk->src.phys.offset - walk->dst.phys.offset; diff |= walk->src.virt.page - walk->dst.virt.page; skcipher_map_src(walk); walk->dst.virt.addr = walk->src.virt.addr; if (diff) { walk->flags |= SKCIPHER_WALK_DIFF; skcipher_map_dst(walk); } return 0; } static int skcipher_walk_next(struct skcipher_walk *walk) { unsigned int bsize; unsigned int n; int err; walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF); n = walk->total; bsize = min(walk->stride, max(n, walk->blocksize)); n = scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); if (unlikely(n < bsize)) { if (unlikely(walk->total < walk->blocksize)) return skcipher_walk_done(walk, -EINVAL); slow_path: err = skcipher_next_slow(walk, bsize); goto set_phys_lowmem; } if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) { if (!walk->page) { gfp_t gfp = skcipher_walk_gfp(walk); walk->page = (void *)__get_free_page(gfp); if (!walk->page) goto slow_path; } walk->nbytes = min_t(unsigned, n, PAGE_SIZE - offset_in_page(walk->page)); walk->flags |= SKCIPHER_WALK_COPY; err = skcipher_next_copy(walk); goto set_phys_lowmem; } walk->nbytes = n; return skcipher_next_fast(walk); set_phys_lowmem: if (!err && (walk->flags & SKCIPHER_WALK_PHYS)) { walk->src.phys.page = virt_to_page(walk->src.virt.addr); walk->dst.phys.page = virt_to_page(walk->dst.virt.addr); walk->src.phys.offset &= PAGE_SIZE - 1; walk->dst.phys.offset &= PAGE_SIZE - 1; } return err; } static int skcipher_copy_iv(struct skcipher_walk *walk) { unsigned a = crypto_tfm_ctx_alignment() - 1; unsigned alignmask = walk->alignmask; unsigned ivsize = walk->ivsize; unsigned bs = walk->stride; unsigned aligned_bs; unsigned size; u8 *iv; aligned_bs = ALIGN(bs, alignmask + 1); /* Minimum size to align buffer by alignmask. */ size = alignmask & ~a; if (walk->flags & SKCIPHER_WALK_PHYS) size += ivsize; else { size += aligned_bs + ivsize; /* Minimum size to ensure buffer does not straddle a page. */ size += (bs - 1) & ~(alignmask | a); } walk->buffer = kmalloc(size, skcipher_walk_gfp(walk)); if (!walk->buffer) return -ENOMEM; iv = PTR_ALIGN(walk->buffer, alignmask + 1); iv = skcipher_get_spot(iv, bs) + aligned_bs; walk->iv = memcpy(iv, walk->iv, walk->ivsize); return 0; } static int skcipher_walk_first(struct skcipher_walk *walk) { if (WARN_ON_ONCE(in_hardirq())) return -EDEADLK; walk->buffer = NULL; if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { int err = skcipher_copy_iv(walk); if (err) return err; } walk->page = NULL; return skcipher_walk_next(walk); } static int skcipher_walk_skcipher(struct skcipher_walk *walk, struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); walk->total = req->cryptlen; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); walk->flags &= ~SKCIPHER_WALK_SLEEP; walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? SKCIPHER_WALK_SLEEP : 0; walk->blocksize = crypto_skcipher_blocksize(tfm); walk->stride = crypto_skcipher_walksize(tfm); walk->ivsize = crypto_skcipher_ivsize(tfm); walk->alignmask = crypto_skcipher_alignmask(tfm); return skcipher_walk_first(walk); } int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic) { int err; might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); walk->flags &= ~SKCIPHER_WALK_PHYS; err = skcipher_walk_skcipher(walk, req); walk->flags &= atomic ? ~SKCIPHER_WALK_SLEEP : ~0; return err; } EXPORT_SYMBOL_GPL(skcipher_walk_virt); int skcipher_walk_async(struct skcipher_walk *walk, struct skcipher_request *req) { walk->flags |= SKCIPHER_WALK_PHYS; INIT_LIST_HEAD(&walk->buffers); return skcipher_walk_skcipher(walk, req); } EXPORT_SYMBOL_GPL(skcipher_walk_async); static int skcipher_walk_aead_common(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); int err; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (unlikely(!walk->total)) return 0; walk->flags &= ~SKCIPHER_WALK_PHYS; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); scatterwalk_copychunks(NULL, &walk->in, req->assoclen, 2); scatterwalk_copychunks(NULL, &walk->out, req->assoclen, 2); scatterwalk_done(&walk->in, 0, walk->total); scatterwalk_done(&walk->out, 0, walk->total); if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) walk->flags |= SKCIPHER_WALK_SLEEP; else walk->flags &= ~SKCIPHER_WALK_SLEEP; walk->blocksize = crypto_aead_blocksize(tfm); walk->stride = crypto_aead_chunksize(tfm); walk->ivsize = crypto_aead_ivsize(tfm); walk->alignmask = crypto_aead_alignmask(tfm); err = skcipher_walk_first(walk); if (atomic) walk->flags &= ~SKCIPHER_WALK_SLEEP; return err; } int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { walk->total = req->cryptlen; return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->total = req->cryptlen - crypto_aead_authsize(tfm); return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt); static void skcipher_set_needkey(struct crypto_skcipher *tfm) { if (crypto_skcipher_max_keysize(tfm) != 0) crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_skcipher_alignmask(tfm); struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); else err = cipher->setkey(tfm, key, keylen); if (unlikely(err)) { skcipher_set_needkey(tfm); return err; } crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_skcipher_setkey); int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); int ret; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_cipher *istat = skcipher_get_stat(alg); atomic64_inc(&istat->encrypt_cnt); atomic64_add(req->cryptlen, &istat->encrypt_tlen); } if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = alg->encrypt(req); return crypto_skcipher_errstat(alg, ret); } EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt); int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); int ret; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_cipher *istat = skcipher_get_stat(alg); atomic64_inc(&istat->decrypt_cnt); atomic64_add(req->cryptlen, &istat->decrypt_tlen); } if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = alg->decrypt(req); return crypto_skcipher_errstat(alg, ret); } EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt); static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); skcipher_set_needkey(skcipher); if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static void crypto_skcipher_free_instance(struct crypto_instance *inst) { struct skcipher_instance *skcipher = container_of(inst, struct skcipher_instance, s.base); skcipher->free(skcipher); } static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); seq_printf(m, "type : skcipher\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize : %u\n", skcipher->chunksize); seq_printf(m, "walksize : %u\n", skcipher->walksize); } static int __maybe_unused crypto_skcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static int __maybe_unused crypto_skcipher_report_stat( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_istat_cipher *istat; struct crypto_stat_cipher rcipher; istat = skcipher_get_stat(skcipher); memset(&rcipher, 0, sizeof(rcipher)); strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); rcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt); rcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen); rcipher.stat_decrypt_cnt = atomic64_read(&istat->decrypt_cnt); rcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen); rcipher.stat_err_cnt = atomic64_read(&istat->err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } static const struct crypto_type crypto_skcipher_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_skcipher_init_tfm, .free = crypto_skcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_skcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_skcipher_report, #endif #ifdef CONFIG_CRYPTO_STATS .report_stat = crypto_skcipher_report_stat, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_skcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_skcipher); struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( const char *alg_name, u32 type, u32 mask) { struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); /* * Make sure we do not allocate something that might get used with * an on-stack request: check the request size. */ if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) > MAX_SYNC_SKCIPHER_REQSIZE)) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } return (struct crypto_sync_skcipher *)tfm; } EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher); int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_skcipher); static int skcipher_prepare_alg(struct skcipher_alg *alg) { struct crypto_istat_cipher *istat = skcipher_get_stat(alg); struct crypto_alg *base = &alg->base; if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 || alg->walksize > PAGE_SIZE / 8) return -EINVAL; if (!alg->chunksize) alg->chunksize = base->cra_blocksize; if (!alg->walksize) alg->walksize = alg->chunksize; base->cra_type = &crypto_skcipher_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) memset(istat, 0, sizeof(*istat)); return 0; } int crypto_register_skcipher(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_skcipher); void crypto_unregister_skcipher(struct skcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); int crypto_register_skciphers(struct skcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_skcipher(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_skciphers); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = skcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(skcipher_register_instance); static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->cipher = cipher; return 0; } static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); crypto_free_cipher(ctx->cipher); } static void skcipher_free_instance_simple(struct skcipher_instance *inst) { crypto_drop_cipher(skcipher_instance_ctx(inst)); kfree(inst); } /** * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode * * Allocate an skcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to skcipher_ctx_simple, and * default ->setkey(), ->init(), and ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. */ struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct skcipher_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *cipher_alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = skcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = skcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.base.cra_priority = cipher_alg->cra_priority; inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.ivsize = cipher_alg->cra_blocksize; /* Use skcipher_ctx_simple by default, can be overridden */ inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple); inst->alg.setkey = skcipher_setkey_simple; inst->alg.init = skcipher_init_tfm_simple; inst->alg.exit = skcipher_exit_tfm_simple; return inst; err_free_inst: skcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
436 4878 4886 1277 2281 1203 602 1203 3765 3764 3771 2315 2315 485 2315 604 604 604 604 638 638 507 505 483 637 1270 1270 1270 3 3 3 3 2 3 1469 1466 3 3 1499 36 36 36 3 33 33 33 33 28 7 7 606 606 3 3 1500 1500 1643 1643 1643 1452 1450 1450 1450 296 297 297 297 1450 1450 1450 3360 3359 3362 3350 3347 953 954 955 955 956 6 5 6 4586 2463 2467 2466 2462 2459 2292 6 948 950 5 4 5 4590 948 141 2279 2101 200 4594 2463 2460 2462 2 2 2 2 2 1450 1450 1450 23 1428 1446 1452 1450 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 // SPDX-License-Identifier: GPL-2.0-only #include <crypto/hash.h> #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <net/checksum.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> /* covers ubuf and kbuf alike */ #define iterate_buf(i, n, base, len, off, __p, STEP) { \ size_t __maybe_unused off = 0; \ len = n; \ base = __p + i->iov_offset; \ len -= (STEP); \ i->iov_offset += len; \ n = len; \ } /* covers iovec and kvec alike */ #define iterate_iovec(i, n, base, len, off, __p, STEP) { \ size_t off = 0; \ size_t skip = i->iov_offset; \ do { \ len = min(n, __p->iov_len - skip); \ if (likely(len)) { \ base = __p->iov_base + skip; \ len -= (STEP); \ off += len; \ skip += len; \ n -= len; \ if (skip < __p->iov_len) \ break; \ } \ __p++; \ skip = 0; \ } while (n); \ i->iov_offset = skip; \ n = off; \ } #define iterate_bvec(i, n, base, len, off, p, STEP) { \ size_t off = 0; \ unsigned skip = i->iov_offset; \ while (n) { \ unsigned offset = p->bv_offset + skip; \ unsigned left; \ void *kaddr = kmap_local_page(p->bv_page + \ offset / PAGE_SIZE); \ base = kaddr + offset % PAGE_SIZE; \ len = min(min(n, (size_t)(p->bv_len - skip)), \ (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \ left = (STEP); \ kunmap_local(kaddr); \ len -= left; \ off += len; \ skip += len; \ if (skip == p->bv_len) { \ skip = 0; \ p++; \ } \ n -= len; \ if (left) \ break; \ } \ i->iov_offset = skip; \ n = off; \ } #define iterate_xarray(i, n, base, len, __off, STEP) { \ __label__ __out; \ size_t __off = 0; \ struct folio *folio; \ loff_t start = i->xarray_start + i->iov_offset; \ pgoff_t index = start / PAGE_SIZE; \ XA_STATE(xas, i->xarray, index); \ \ len = PAGE_SIZE - offset_in_page(start); \ rcu_read_lock(); \ xas_for_each(&xas, folio, ULONG_MAX) { \ unsigned left; \ size_t offset; \ if (xas_retry(&xas, folio)) \ continue; \ if (WARN_ON(xa_is_value(folio))) \ break; \ if (WARN_ON(folio_test_hugetlb(folio))) \ break; \ offset = offset_in_folio(folio, start + __off); \ while (offset < folio_size(folio)) { \ base = kmap_local_folio(folio, offset); \ len = min(n, len); \ left = (STEP); \ kunmap_local(base); \ len -= left; \ __off += len; \ n -= len; \ if (left || n == 0) \ goto __out; \ offset += len; \ len = PAGE_SIZE; \ } \ } \ __out: \ rcu_read_unlock(); \ i->iov_offset += __off; \ n = __off; \ } #define __iterate_and_advance(i, n, base, len, off, I, K) { \ if (unlikely(i->count < n)) \ n = i->count; \ if (likely(n)) { \ if (likely(iter_is_ubuf(i))) { \ void __user *base; \ size_t len; \ iterate_buf(i, n, base, len, off, \ i->ubuf, (I)) \ } else if (likely(iter_is_iovec(i))) { \ const struct iovec *iov = iter_iov(i); \ void __user *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ iov, (I)) \ i->nr_segs -= iov - iter_iov(i); \ i->__iov = iov; \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ size_t len; \ iterate_bvec(i, n, base, len, off, \ bvec, (K)) \ i->nr_segs -= bvec - i->bvec; \ i->bvec = bvec; \ } else if (iov_iter_is_kvec(i)) { \ const struct kvec *kvec = i->kvec; \ void *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ kvec, (K)) \ i->nr_segs -= kvec - i->kvec; \ i->kvec = kvec; \ } else if (iov_iter_is_xarray(i)) { \ void *base; \ size_t len; \ iterate_xarray(i, n, base, len, off, \ (K)) \ } \ i->count -= n; \ } \ } #define iterate_and_advance(i, n, base, len, off, I, K) \ __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0)) static int copyout(void __user *to, const void *from, size_t n) { if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } static int copyout_nofault(void __user *to, const void *from, size_t n) { long res; if (should_fail_usercopy()) return n; res = copy_to_user_nofault(to, from, n); return res < 0 ? n : res; } static int copyin(void *to, const void __user *from, size_t n) { size_t res = n; if (should_fail_usercopy()) return n; if (access_ok(from, n)) { instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } return res; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .copy_mc = false, .nofault = false, .user_backed = true, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); static __wsum csum_and_memcpy(void *to, const void *from, size_t len, __wsum sum, size_t off) { __wsum next = csum_partial_copy_nocheck(from, to, len); return csum_block_add(sum, next, off); } size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyout(base, addr + off, len), memcpy(base, addr + off, len) ) return bytes; } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static int copyout_mc(void __user *to, const void *from, size_t n) { if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = copy_mc_to_user((__force void *) to, from, n); } return n; } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, copyout_mc(base, addr + off, len), copy_mc_to_kernel(base, addr + off, len) ) return bytes; } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from, size_t size) { if (iov_iter_is_copy_mc(i)) return (void *)copy_mc_to_kernel(to, from, size); return memcpy(to, from, size); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyin(addr + off, base, len), memcpy_from_iter(i, addr + off, base, len) ) return bytes; } EXPORT_SYMBOL(_copy_from_iter); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; iterate_and_advance(i, bytes, base, len, off, __copy_from_user_inatomic_nocache(addr + off, base, len), memcpy(addr + off, base, len) ) return bytes; } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; iterate_and_advance(i, bytes, base, len, off, __copy_from_user_flushcache(addr + off, base, len), memcpy_flushcache(addr + off, base, len) ) return bytes; } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. */ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); iterate_and_advance(i, n, base, len, off, copyout_nofault(base, kaddr + offset + off, len), memcpy(base, kaddr + offset + off, len) ) kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { iterate_and_advance(i, bytes, base, len, count, clear_user(base, len), memset(base, 0, len) ) return bytes; } EXPORT_SYMBOL(iov_iter_zero); size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *p; n = bytes - copied; if (PageHighMem(page)) { page += offset / PAGE_SIZE; offset %= PAGE_SIZE; n = min_t(size_t, n, PAGE_SIZE - offset); } p = kmap_atomic(page) + offset; iterate_and_advance(i, n, base, len, off, copyin(p + off, base, len), memcpy_from_iter(i, p + off, base, len) ) kunmap_atomic(p); copied += n; offset += n; } while (PageHighMem(page) && copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .copy_mc = false, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .copy_mc = false, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .copy_mc = false, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .copy_mc = false, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { size_t size = i->count; size_t skip = i->iov_offset; unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; size -= len; if (!size) break; } return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { size_t size = i->count; unsigned skip = i->iov_offset; unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { size_t len = i->bvec[k].bv_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) return false; size -= len; if (!size) break; } return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. * * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; if (!size) break; } } return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { size_t len = i->bvec[k].bv_len - skip; res |= (unsigned long)i->bvec[k].bv_offset + skip; if (len > size) len = size; res |= len; size -= len; if (!size) break; } return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct page *page; unsigned int ret = 0; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = find_subpage(page, xas.xa_index); get_page(pages[ret]); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) get_page(p[k] = page + k); maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { __wsum sum, next; sum = *csum; if (WARN_ON_ONCE(!i->data_source)) return 0; iterate_and_advance(i, bytes, base, len, off, ({ next = csum_and_copy_from_user(base, addr + off, len); sum = csum_block_add(sum, next, off); next ? 0 : len; }), ({ sum = csum_and_memcpy(addr + off, base, len, sum, off); }) ) *csum = sum; return bytes; } EXPORT_SYMBOL(csum_and_copy_from_iter); size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { struct csum_state *csstate = _csstate; __wsum sum, next; if (WARN_ON_ONCE(i->data_source)) return 0; if (unlikely(iov_iter_is_discard(i))) { // can't use csum_memcpy() for that one - data is not copied csstate->csum = csum_block_add(csstate->csum, csum_partial(addr, bytes, 0), csstate->off); csstate->off += bytes; return bytes; } sum = csum_shift(csstate->csum, csstate->off); iterate_and_advance(i, bytes, base, len, off, ({ next = csum_and_copy_to_user(addr + off, base, len); sum = csum_block_add(sum, next, off); next ? 0 : len; }), ({ sum = csum_and_memcpy(base, addr + off, len, sum, off); }) ) csstate->csum = csum_shift(sum, csstate->off); csstate->off += bytes; return bytes; } EXPORT_SYMBOL(csum_and_copy_to_iter); size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i) { #ifdef CONFIG_CRYPTO_HASH struct ahash_request *hash = hashp; struct scatterlist sg; size_t copied; copied = copy_to_iter(addr, bytes, i); sg_init_one(&sg, addr, copied); ahash_request_set_crypt(hash, &sg, NULL, copied); crypto_ahash_update(hash); return copied; #else return 0; #endif } EXPORT_SYMBOL(hash_and_copy_to_iter); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, unsigned long nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT, i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; *iovp = NULL; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_single_range(int rw, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL(import_single_range); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page *page, **p; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; pgoff_t index = pos >> PAGE_SHIFT; XA_STATE(xas, i->xarray, index); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = find_subpage(page, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of contiguous pages from an ITER_BVEC iterator. This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; size_t skip = i->iov_offset, offset, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->bvec->bv_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; offset = skip % PAGE_SIZE; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (k = 0; k < maxpages; k++) p[k] = page + k; size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are * merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
111 1328 2375 2493 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* * If set, the fscrypt bounce page pool won't be allocated (unless another * filesystem needs it). Set this if the filesystem always uses its own bounce * pages for writes and therefore won't need the fscrypt bounce page pool. */ #define FS_CFLG_OWN_PAGES (1U << 1) /* Crypto operations for filesystems */ struct fscrypt_operations { /* Set of optional flags; see above for allowed flags */ unsigned int flags; /* * If set, this is a filesystem-specific key description prefix that * will be accepted for "logon" keys for v1 fscrypt policies, in * addition to the generic prefix "fscrypt:". This functionality is * deprecated, so new filesystems shouldn't set this field. */ const char *key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Get the number of bits that the filesystem uses to represent inode * numbers and file logical block numbers. * * By default, both of these are assumed to be 64-bit. This function * can be implemented to declare that either or both of these numbers is * shorter, which may allow the use of the * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags and/or the use of * inline crypto hardware whose maximum DUN length is less than 64 bits * (e.g., eMMC v5.2 spec compliant hardware). This function only needs * to be implemented if support for one of these features is needed. */ void (*get_ino_and_lblk_bits)(struct super_block *sb, int *ino_bits_ret, int *lblk_bits_ret); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its plaintext alias * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be * cleared. Note that we don't have to support arbitrary moves of this flag * because fscrypt doesn't allow no-key names to be the source or target of a * rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */
6 6 6 6 324 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 // SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> */ /* Goals and Theory of Operation The primary goal of this feature is to reduce the kernel boot time, by doing various independent hardware delays and discovery operations decoupled and not strictly serialized. More specifically, the asynchronous function call concept allows certain operations (primarily during system boot) to happen asynchronously, out of order, while these operations still have their externally visible parts happen sequentially and in-order. (not unlike how out-of-order CPUs retire their instructions in order) Key to the asynchronous function call implementation is the concept of a "sequence cookie" (which, although it has an abstracted type, can be thought of as a monotonically incrementing number). The async core will assign each scheduled event such a sequence cookie and pass this to the called functions. The asynchronously called function should before doing a globally visible operation, such as registering device numbers, call the async_synchronize_cookie() function and pass in its own cookie. The async_synchronize_cookie() function will make sure that all asynchronous operations that were scheduled prior to the operation corresponding with the cookie have completed. Subsystem/driver initialization code that scheduled asynchronous probe functions, but which shares global resources with other drivers/subsystems that do not use the asynchronous call feature, need to do a full synchronization with the async_synchronize_full() function, before returning from their init function. This is to maintain strict ordering between the asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> #include <linux/atomic.h> #include <linux/ktime.h> #include <linux/export.h> #include <linux/wait.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "workqueue_internal.h" static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 #define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ static LIST_HEAD(async_global_pending); /* pending from all registered doms */ static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); struct async_entry { struct list_head domain_list; struct list_head global_list; struct work_struct work; async_cookie_t cookie; async_func_t func; void *data; struct async_domain *domain; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; static long long microseconds_since(ktime_t start) { ktime_t now = ktime_get(); return ktime_to_ns(ktime_sub(now, start)) >> 10; } static async_cookie_t lowest_in_progress(struct async_domain *domain) { struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); if (domain) { if (!list_empty(&domain->pending)) first = list_first_entry(&domain->pending, struct async_entry, domain_list); } else { if (!list_empty(&async_global_pending)) first = list_first_entry(&async_global_pending, struct async_entry, global_list); } if (first) ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; } /* * pick the first pending entry and run it */ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; ktime_t calltime; /* 1) run (and print duration) */ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); entry->func(entry->data, entry->cookie); pr_debug("initcall %lli_%pS returned after %lld usecs\n", (long long)entry->cookie, entry->func, microseconds_since(calltime)); /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); list_del_init(&entry->domain_list); list_del_init(&entry->global_list); /* 3) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); /* 4) wake up any waiters */ wake_up(&async_done); } /** * async_schedule_node_domain - NUMA specific version of async_schedule_domain * @func: function to execute asynchronously * @data: data pointer to pass to the function * @node: NUMA node that we want to schedule this on or close to * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. * @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * * Note: This function may be called from atomic or non-atomic contexts. * * The node requested will be honored on a best effort basis. If the node * has no CPUs associated with it then the work is distributed among all * available CPUs. */ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); /* * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ func(data, newcookie); return newcookie; } INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); entry->func = func; entry->data = data; entry->domain = domain; spin_lock_irqsave(&async_lock, flags); /* allocate cookie and queue */ newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->domain_list, &domain->pending); if (domain->registered) list_add_tail(&entry->global_list, &async_global_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); return newcookie; } EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. * * The node requested will be honored on a best effort basis. If the node * has no CPUs associated with it then the work is distributed among all * available CPUs. */ async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { return async_schedule_node_domain(func, data, node, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_synchronize_full - synchronize all asynchronous function calls * * This function waits until all asynchronous function calls have been done. */ void async_synchronize_full(void) { async_synchronize_full_domain(NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the * synchronization domain specified by @domain have been done. */ void async_synchronize_full_domain(struct async_domain *domain) { async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint * @domain: the domain to synchronize (%NULL for all registered domains) * * This function waits until all asynchronous function calls for the * synchronization domain specified by @domain submitted prior to @cookie * have been done. */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { ktime_t starttime; pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); wait_event(async_done, lowest_in_progress(domain) >= cookie); pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), microseconds_since(starttime)); } EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); /** * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint * * This function waits until all asynchronous function calls prior to @cookie * have been done. */ void async_synchronize_cookie(async_cookie_t cookie) { async_synchronize_cookie_domain(cookie, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); /** * current_is_async - is %current an async worker task? * * Returns %true if %current is an async worker task. */ bool current_is_async(void) { struct worker *worker = current_wq_worker(); return worker && worker->current_func == async_run_entry_fn; } EXPORT_SYMBOL_GPL(current_is_async);
17195 2749 2 15304 8767 206 16465 4320 4319 15188 16463 8548 16466 2551 126 1160 6078 393 2228 5452 5 4348 6058 15539 852 1179 4204 2204 852 1902 881 2274 587 2946 4347 2720 3248 449 7044 1231 2277 1173 3752 3362 2194 1497 7092 81 81 4320 536 1586 2749 1579 2277 2277 1173 654 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages not added to the page allocator when onlining a section because * they were excluded via the online_page_callback() or because they are * PG_hwpoison. * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - Pages part of an offline section (struct pages of offline sections should * not be trusted as they will be initialized when first onlined). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_error is set to indicate that an I/O error occurred on this page. * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_mappedtodisk, /* Filesystems */ PG_checked = PG_owner_priv_1, /* SwapBacked */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, /* non-lru isolated movable page */ PG_isolated = PG_reclaim, /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, PG_hugetlb = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } #endif static __always_inline int page_is_fake_head(struct page *page) { return page_fixed_fake_head(page) != page; } static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) nth_page(&(folio)->page, n) static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { return test_bit(PG_head, &page->flags) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(PageTail(page), page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_ONLY_HEAD: * for compound page, callers only ever operate on the head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_ONLY_HEAD(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(PageTail(page), page); \ PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_ONLY_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ static __always_inline bool folio_test_##lname(struct folio *folio) \ { return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ static __always_inline \ void folio_set_##lname(struct folio *folio) \ { set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ static __always_inline \ void folio_clear_##lname(struct folio *folio) \ { clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ static __always_inline \ void __folio_set_##lname(struct folio *folio) \ { __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ static __always_inline \ void __folio_clear_##lname(struct folio *folio) \ { __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ static __always_inline \ bool folio_test_set_##lname(struct folio *folio) \ { return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ static __always_inline \ bool folio_test_clear_##lname(struct folio *folio) \ { return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname, lname) \ static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ static inline void folio_set_##lname(struct folio *folio) { } \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ static inline void folio_clear_##lname(struct folio *folio) { } \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ static inline void __folio_clear_##lname(struct folio *folio) { } \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ static inline bool folio_test_set_##lname(struct folio *folio) \ { return 0; } \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ static inline bool folio_test_clear_##lname(struct folio *folio) \ { return 0; } \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) __SETPAGEFLAG(Referenced, referenced, PF_HEAD) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, folio_flags(folio, 0)); } static __always_inline bool PageSwapCache(struct page *page) { return folio_test_swapcache(page_folio(page)); } SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else PAGEFLAG_FALSE(SwapCache, swapcache) #endif PAGEFLAG(Unevictable, unevictable, PF_HEAD) __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) TESTSCFLAG_FALSE(Mlocked, mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached, uncached) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #define MAGIC_HWPOISON 0x48575053U /* HWPS */ extern void SetPageHWPoisonTakenOff(struct page *page); extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON * bit; and then page->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable * page and then page->mapping points to a struct movable_operations. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" * refers to user virtual address space into which the page is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the page->mapping does not exist as such, nor do these * flags below. So in order to avoid testing non-existent bits, please * make sure that PageSlab(page) actually evaluates to false before calling * the following functions (e.g., PageAnon). See mm/slab.h. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) /* * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline int PageMappingFlags(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline bool folio_test_anon(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } static __always_inline bool PageAnon(struct page *page) { return folio_test_anon(page_folio(page)); } static __always_inline bool __folio_test_movable(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; } static __always_inline int __PageMovable(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } static __always_inline bool PageKsm(struct page *page) { return folio_test_ksm(page_folio(page)); } #else TESTPAGEFLAG_FALSE(Ksm, ksm) #endif u64 stable_page_flags(struct page *page); /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(struct folio *folio) { bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline int PageUptodate(struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) bool __folio_start_writeback(struct folio *folio, bool keep_write); bool set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) static inline bool test_set_page_writeback(struct page *page) { return set_page_writeback(page); } static __always_inline bool folio_test_head(struct folio *folio) { return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) #else TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) /** * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs * @folio: The folio to test. * * Context: Any context. Caller should have a reference on the folio to * prevent it from being turned into a tail page. * Return: True for hugetlbfs folios, false for anon folios or folios * belonging to other filesystems. */ static inline bool folio_test_hugetlb(struct folio *folio) { return folio_test_large(folio) && test_bit(PG_hugetlb, folio_flags(folio, 1)); } #else TESTPAGEFLAG_FALSE(Huge, hugetlb) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for * normal or transparent huge pages. * * PageTransHuge() returns true for both transparent huge and * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. */ static inline int PageTransHuge(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); } /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(struct page *page) { return PageCompound(page); } /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransTail(struct page *page) { return PageTail(page); } #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) #else PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(struct page *page) { if (PageHWPoison(page)) return true; return PageHuge(page) && PageHWPoison(compound_head(page)); } /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and * low bits so that an underflow or overflow of page_mapcount() won't be * mistaken for a page type value. */ #define PAGE_TYPE_BASE 0xf0000000 /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 #define PG_table 0x00000200 #define PG_guard 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) #define folio_test_type(folio, flag) \ ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { return (int)page_type < PAGE_MAPCOUNT_RESERVE; } static inline int page_has_type(struct page *page) { return page_type_has_type(page->page_type); } #define PAGE_TYPE_OPS(uname, lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ static __always_inline int folio_test_##fname(const struct folio *folio)\ { \ return folio_test_type(folio, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ folio->page.page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type |= PG_##lname; \ } \ /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free pages, * allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require to re-set the pages PageOffline() and not giving them to the * buddy via online_page_callback_t. * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); static __always_inline int PageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_hugetlb | 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked * * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. */ static inline int page_has_private(struct page *page) { return !!(page->flags & PAGE_FLAGS_PRIVATE); } static inline bool folio_has_private(struct folio *folio) { return page_has_private(&folio->page); } #undef PF_ANY #undef PF_HEAD #undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #include "main.h" #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv); void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway); void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); void batadv_gw_node_release(struct kref *ref); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr); struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); /** * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release * it * @gw_node: gateway node to free */ static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node) { if (!gw_node) return; kref_put(&gw_node->refcount, batadv_gw_node_release); } #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); #else static inline int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { return 0; } #endif static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline void khugepaged_exit(struct mm_struct *mm) { } static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } static inline int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { return 0; } static inline void khugepaged_min_free_kbytes_update(void) { } static inline bool current_is_khugepaged(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/device.h> #include <linux/property.h> /* TBD: Make dynamic */ #define ACPI_MAX_HANDLES 10 struct acpi_handle_list { u32 count; acpi_handle handles[ACPI_MAX_HANDLES]; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); acpi_status acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); acpi_status acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #endif #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); bool acpi_reduced_hardware(void); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { const struct acpi_device_id *ids; struct list_head list_node; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ struct acpi_hotplug_context { struct acpi_device *self; int (*notify)(struct acpi_device *, u32); void (*uevent)(struct acpi_device *, u32); void (*fixup)(struct acpi_device *); }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef void (*acpi_op_remove) (struct acpi_device *device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; struct module *owner; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 honor_deps:1; u32 reserved:18; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[40]; typedef char acpi_device_class[20]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 backlight:1; u32 reserved:28; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ union acpi_object *str_obj; /* unicode string for _STR method */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) ((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ struct list_head resources; /* Power resources referenced */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ u8 state_for_enumeration; /* Deepest power state for enumeration */ }; struct acpi_dep_data { struct list_head node; acpi_handle supplier; acpi_handle consumer; bool honor_dep; bool met; bool free_when_met; }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { unsigned int node_id; struct list_head node; struct device *dev; bool put_online:1; }; struct acpi_device_properties { const guid_t *guid; union acpi_object *properties; struct list_head list; void **bufs; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; /* Device */ struct acpi_device { u32 pld_crc; int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? (!strcmp(to_acpi_data_node(fwnode)->name, name)) : false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return &adev->fwnode; } static inline void *acpi_driver_data(struct acpi_device *d) { return d->driver_data; } #define to_acpi_device(d) container_of(d, struct acpi_device, dev) #define to_acpi_driver(d) container_of(d, struct acpi_driver, drv) static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev) { if (adev->dev.parent) return to_acpi_device(adev->dev.parent); return NULL; } static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta) { *((u32 *)&adev->status) = sta; } static inline void acpi_set_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp) { hp->self = adev; adev->hp = hp; } void acpi_initialize_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp, int (*notify)(struct acpi_device *, u32), void (*uevent)(struct acpi_device *, u32)); /* acpi_device.dev.bus == &acpi_bus_type */ extern struct bus_type acpi_bus_type; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); /* * Events * ------ */ struct acpi_bus_event { struct list_head node; acpi_device_class device_class; acpi_bus_id bus_id; u32 type; u32 data; }; extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); int acpi_bus_get_private_data(acpi_handle, void **); int acpi_bus_attach_private_data(acpi_handle, void *); void acpi_bus_detach_private_data(acpi_handle); int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); /* * External Functions */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_set_power(acpi_handle handle, int state); const char *acpi_power_state_string(int state); int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); void acpi_dev_power_up_children_with_adr(struct acpi_device *adev); u8 acpi_dev_power_state_for_wake(struct acpi_device *adev); int acpi_device_power_add_dependent(struct acpi_device *adev, struct device *dev); void acpi_device_power_remove_dependent(struct acpi_device *adev, struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); #else static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; } #endif void acpi_scan_lock_acquire(void); void acpi_scan_lock_release(void); void acpi_lock_hp_context(void); void acpi_unlock_hp_context(void); int acpi_scan_add_handler(struct acpi_scan_handler *handler); int acpi_bus_register_driver(struct acpi_driver *driver); void acpi_bus_unregister_driver(struct acpi_driver *driver); int acpi_bus_scan(acpi_handle handle); void acpi_bus_trim(struct acpi_device *start); acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids); void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len); static inline bool acpi_device_enumerated(struct acpi_device *adev) { return adev && adev->flags.initialized && adev->flags.visited; } /** * module_acpi_driver(acpi_driver) - Helper macro for registering an ACPI driver * @__acpi_driver: acpi_driver struct * * Helper macro for ACPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_acpi_driver(__acpi_driver) \ module_driver(__acpi_driver, acpi_bus_register_driver, \ acpi_bus_unregister_driver) /* * Bind physical devices with ACPI devices */ struct acpi_bus_type { struct list_head list; const char *name; bool (*match)(struct device *dev); struct acpi_device * (*find_companion)(struct device *); void (*setup)(struct device *); }; int register_acpi_bus_type(struct acpi_bus_type *); int unregister_acpi_bus_type(struct acpi_bus_type *); int acpi_bind_one(struct device *dev, struct acpi_device *adev); int acpi_unbind_one(struct device *dev); enum acpi_bridge_type { ACPI_BRIDGE_TYPE_PCIE = 1, ACPI_BRIDGE_TYPE_CXL, }; struct acpi_pci_root { struct acpi_device * device; struct pci_bus *bus; u16 segment; int bridge_type; struct resource secondary; /* downstream bus range */ u32 osc_support_set; /* _OSC state of support bits */ u32 osc_control_set; /* _OSC state of control bits */ u32 osc_ext_support_set; /* _OSC state of extended support bits */ u32 osc_ext_control_set; /* _OSC state of extended control bits */ phys_addr_t mcfg_addr; }; /* helper */ bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, struct fwnode_handle *fwnode, const struct iommu_ops *ops); int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map); int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id); static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return acpi_dma_configure_id(dev, attr, NULL); } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); int acpi_install_cmos_rtc_space_handler(acpi_handle handle); void acpi_remove_cmos_rtc_space_handler(acpi_handle handle); #else static inline bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status) { return false; } static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } static inline int acpi_install_cmos_rtc_space_handler(acpi_handle handle) { return 1; } static inline void acpi_remove_cmos_rtc_space_handler(acpi_handle handle) { } #endif #if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS) bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev); int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip); bool acpi_quirk_skip_gpio_event_handlers(void); #else static inline bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev) { return false; } static inline int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { *skip = false; return 0; } static inline bool acpi_quirk_skip_gpio_event_handlers(void) { return false; } #endif #ifdef CONFIG_PM void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { } static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev) { return AE_SUPPORT; } static inline bool acpi_pm_device_can_wakeup(struct device *dev) { return false; } static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m) { if (p) *p = ACPI_STATE_D0; return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT bool acpi_sleep_state_supported(u8 sleep_state); #else static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif #ifdef CONFIG_ACPI_SLEEP u32 acpi_target_system_state(void); #else static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; } #endif static inline bool acpi_device_power_manageable(struct acpi_device *adev) { return adev->flags.power_manageable; } static inline bool acpi_device_can_wakeup(struct acpi_device *adev) { return adev->wakeup.flags.valid; } static inline bool acpi_device_can_poweroff(struct acpi_device *adev) { return adev->power.states[ACPI_STATE_D3_COLD].flags.valid || ((acpi_gbl_FADT.header.revision < 6) && adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2); int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, struct acpi_device *start); /** * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a * given supplier * @supplier: Pointer to the supplier's ACPI device * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL */ #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL); \ consumer; \ consumer = acpi_dev_get_next_consumer_dev(supplier, consumer)) struct acpi_device * acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv); struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); /** * for_each_acpi_dev_match - iterate over ACPI devices that matching the criteria * @adev: pointer to the matching ACPI device, NULL at the end of the loop * @hid: Hardware ID of the device. * @uid: Unique ID of the device, pass NULL to not check _UID * @hrv: Hardware Revision of the device, pass -1 to not check _HRV * * The caller is responsible for invoking acpi_dev_put() on the returned device. */ #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv); \ adev; \ adev = acpi_dev_get_next_match_dev(adev, hid, uid, hrv)) static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev) { return adev ? to_acpi_device(get_device(&adev->dev)) : NULL; } static inline void acpi_dev_put(struct acpi_device *adev) { if (adev) put_device(&adev->dev); } struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle); struct acpi_device *acpi_get_acpi_dev(acpi_handle handle); static inline void acpi_put_acpi_dev(struct acpi_device *adev) { acpi_dev_put(adev); } #else /* CONFIG_ACPI */ static inline int register_acpi_bus_type(void *bus) { return 0; } static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* vcan.c - Virtual CAN interface * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/can-ml.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vcan" MODULE_DESCRIPTION("virtual CAN interface"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); /* CAN test feature: * Enable the echo on driver level for testing the CAN core echo modes. * See Documentation/networking/can.rst for details. */ static bool echo; /* echo testing. Default: 0 (Off) */ module_param(echo, bool, 0444); MODULE_PARM_DESC(echo, "Echo sent frames (for testing). Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; stats->rx_packets++; stats->rx_bytes += can_skb_get_data_len(skb); skb->pkt_type = PACKET_BROADCAST; skb->dev = dev; skb->ip_summed = CHECKSUM_UNNECESSARY; netif_rx(skb); } static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len; int loop; if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; len = can_skb_get_data_len(skb); stats->tx_packets++; stats->tx_bytes += len; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; skb_tx_timestamp(skb); if (!echo) { /* no echo handling available inside this driver */ if (loop) { /* only count the packets here, because the * CAN core already did the echo for us */ stats->rx_packets++; stats->rx_bytes += len; } consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ vcan_rx(skb, dev); } else { /* no looped packets => no counting */ consume_skb(skb); } return NETDEV_TX_OK; } static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops vcan_netdev_ops = { .ndo_start_xmit = vcan_tx, .ndo_change_mtu = vcan_change_mtu, }; static const struct ethtool_ops vcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vcan_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); /* set flags according to driver capabilities */ if (echo) dev->flags |= IFF_ECHO; dev->netdev_ops = &vcan_netdev_ops; dev->ethtool_ops = &vcan_ethtool_ops; dev->needs_free_netdev = true; } static struct rtnl_link_ops vcan_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct can_ml_priv), .setup = vcan_setup, }; static __init int vcan_init_module(void) { pr_info("Virtual CAN interface driver\n"); if (echo) pr_info("enabled echo on driver level.\n"); return rtnl_link_register(&vcan_link_ops); } static __exit void vcan_cleanup_module(void) { rtnl_link_unregister(&vcan_link_ops); } module_init(vcan_init_module); module_exit(vcan_cleanup_module);
465 330 1144 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #ifndef _LINUX_SIPHASH_H #define _LINUX_SIPHASH_H #include <linux/types.h> #include <linux/kernel.h> #define SIPHASH_ALIGNMENT __alignof__(u64) typedef struct { u64 key[2]; } siphash_key_t; #define siphash_aligned_key_t siphash_key_t __aligned(16) static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); u64 siphash_3u64(const u64 a, const u64 b, const u64 c, const siphash_key_t *key); u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, const siphash_key_t *key); u64 siphash_1u32(const u32 a, const siphash_key_t *key); u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t *key); static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t *key) { return siphash_1u64((u64)b << 32 | a, key); } static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const siphash_key_t *key) { return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); } static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return siphash_1u32(le32_to_cpup((const __le32 *)data), key); if (__builtin_constant_p(len) && len == 8) return siphash_1u64(le64_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 16) return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 24) return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 32) return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), le64_to_cpu(data[3]), key); return __siphash_aligned(data, len, key); } /** * siphash - compute 64-bit siphash PRF value * @data: buffer to hash * @size: size of @data * @key: the siphash key */ static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); return ___siphash_aligned(data, len, key); } #define HSIPHASH_ALIGNMENT __alignof__(unsigned long) typedef struct { unsigned long key[2]; } hsiphash_key_t; u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, const hsiphash_key_t *key); u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const hsiphash_key_t *key); static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return hsiphash_1u32(le32_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 8) return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 12) return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 16) return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), le32_to_cpu(data[3]), key); return __hsiphash_aligned(data, len, key); } /** * hsiphash - compute 32-bit hsiphash PRF value * @data: buffer to hash * @size: size of @data * @key: the hsiphash key */ static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); return ___hsiphash_aligned(data, len, key); } /* * These macros expose the raw SipHash and HalfSipHash permutations. * Do not use them directly! If you think you have a use for them, * be sure to CC the maintainer of this file explaining why. */ #define SIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \ (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \ (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \ (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32)) #define SIPHASH_CONST_0 0x736f6d6570736575ULL #define SIPHASH_CONST_1 0x646f72616e646f6dULL #define SIPHASH_CONST_2 0x6c7967656e657261ULL #define SIPHASH_CONST_3 0x7465646279746573ULL #define HSIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \ (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \ (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \ (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16)) #define HSIPHASH_CONST_0 0U #define HSIPHASH_CONST_1 0U #define HSIPHASH_CONST_2 0x6c796765U #define HSIPHASH_CONST_3 0x74656462U #endif /* _LINUX_SIPHASH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * by hand ahead of time. */ if (write && pos < vma->vm_start) { mmap_write_lock(mm); ret = expand_downwards(vma, pos); if (unlikely(ret < 0)) { mmap_write_unlock(mm); return NULL; } mmap_write_downgrade(mm); } else mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; bprm->argmin = bprm->p - limit; return 0; } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; #ifdef CONFIG_MMU if (bprm->p < bprm->argmin) goto out; #endif while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; unsigned long old_end = vma->vm_end; unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); /* * ensure there are no vmas between where we want to go * and where we are */ if (vma != vma_next(&vmi)) return -EFAULT; vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* * move the page tables downwards, on failure we rely on * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch * the address space in [new_end, old_start) some architectures * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); vma_prev(&vmi); /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ err = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; err = deny_write_access(file); if (err) goto exit; out: return file; exit: fput(file); return ERR_PTR(err); } struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); /* Always NUL terminated and zero-padded */ strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) { struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); int retval = -ENOMEM; if (!bprm) goto out; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) goto out_free; return bprm; out_free: free_bprm(bprm); out: return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { int ret = 0; unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) { ret = -EFAULT; goto out; } kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; ret = 0; out: return ret; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } /* * sys_execve() executes a new program. */ static int bprm_execve(struct linux_binprm *bprm, int fd, struct filename *filename, int flags) { struct file *file; int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec(); bprm->file = file; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (bprm->fdpath && get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm, fd, filename, 0); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { } }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */
57 57 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */
107 837 108 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; /* * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); #else static inline void bdev_cache_init(void) { } #endif /* CONFIG_BLOCK */ /* * buffer.c */ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * fs_context.c */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); /* * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); /* * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { put_write_access(file->f_inode); __mnt_drop_write(file->f_path.mnt); } } /* * super.c */ extern int reconfigure_super(struct fs_context *); extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* * Prepare superblock for changing its read-only state (i.e., either remount * read-write superblock read-only or vice versa). After this function returns * mnt_is_readonly() will return true for any mount of the superblock if its * caller is able to observe any changes done by the remount. This holds until * sb_end_ro_state_change() is called. */ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * __mnt_want_write() before the mnt_is_readonly() check. The barrier * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already * cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. */ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); extern struct file *__close_fd_get_file(unsigned int fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); void lock_two_inodes(struct inode *inode1, struct inode *inode2, unsigned subclass1, unsigned subclass2); /* * fs-writeback.c */ extern long get_nr_dirty_inodes(void); void invalidate_inodes(struct super_block *sb); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; /* * fs/stat.c: */ int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 */ /* * Header for use in defining a given L4 protocol for connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalized L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_protcol.h */ #ifndef _NF_CONNTRACK_L4PROTO_H #define _NF_CONNTRACK_L4PROTO_H #include <linux/netlink.h> #include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netns/generic.h> struct seq_file; struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; /* Resolve clashes on insertion races. */ bool allow_clash; /* protoinfo nlattr size, closes a hole */ u16 nlattr_size; /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); const struct nla_policy *nla_policy; struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); u16 obj_size; u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif }; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr); int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO IPPROTO_UDPLITE const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...); __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...); #else static inline __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) {} static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } #endif #ifdef CONFIG_NF_CT_PROTO_DCCP static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) { return &net->ct.nf_ct_proto.dccp; } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { return &net->ct.nf_ct_proto.sctp; } #endif #ifdef CONFIG_NF_CT_PROTO_GRE static inline struct nf_gre_net *nf_gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } #endif #endif /*_NF_CONNTRACK_PROTOCOL_H*/
1191 1191 1191 1191 1191 1190 1130 436 428 428 428 428 428 428 428 428 428 426 427 426 426 437 427 425 424 424 111 112 4735 444 4548 3959 3880 3880 4494 4212 4548 4548 4548 4547 4548 4289 4548 1976 1679 4547 3933 3930 4548 681 4548 3764 4548 4547 4548 4536 3926 4735 4735 62 62 60 5018 565 565 5207 5207 5017 5207 5207 5207 2406 5213 3689 5213 3 2 5212 565 5212 5206 6 3316 5211 2361 5212 3696 96 2403 2321 5212 3761 5206 5206 5206 5206 5205 5206 5212 5207 6 6 5207 5137 5204 5101 5099 1127 4637 5031 5207 954 953 952 949 954 2 969 921 923 3764 3764 1742 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; long error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = mnt_want_write(path->mnt); if (error) goto out; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_truncate); long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; f = fdget(fd); if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ if (f.file->f_flags & O_LARGEFILE) small = 0; dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) goto out_putf; error = -EPERM; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); error = security_file_truncate(f.file); if (!error) error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; if (offset < 0 || len <= 0) return -EINVAL; /* Return error if mode is not supported */ if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; /* Punch hole must have keep size set */ if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* Collapse range should only be used exclusively. */ if ((mode & FALLOC_FL_COLLAPSE_RANGE) && (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; /* Insert range should only be used exclusively. */ if ((mode & FALLOC_FL_INSERT_RANGE) && (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; /* Unshare range should only be used with allocate mode. */ if ((mode & FALLOC_FL_UNSHARE_RANGE) && (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * We can only allow pure fallocate on append only files */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-RCY. */ override_cred->non_rcu = 1; old_cred = override_creds(override_cred); /* override_cred() gets its own ref */ put_cred(override_cred); return old_cred; } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) revert_creds(old_cred); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); int error; error = -EBADF; if (!f.file) goto out; error = -ENOTDIR; if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct fd f = fdget(fd); int err = -EBADF; if (f.file) { err = vfs_fchmod(f.file, mode); fdput(f); } return err; } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; if (f.file) { error = vfs_fchown(f.file, user, group); fdput(f); } return error; } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); if (unlikely(error)) { put_write_access(inode); goto cleanup_file; } f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Paired with smp_mb() in collapse_file() to ensure nr_thps * is up to date and the update to i_writecount by * get_write_access() is visible. Ensures subsequent insertion * of THPs into the page cache will fail. */ smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for symmetry. */ fsnotify_open(f); return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, d_backing_inode(dentry), open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { file->f_path = *path; return do_dentry_open(file, d_backing_inode(path->dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; validate_creds(cred); /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; validate_creds(cred); f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @inode: the inode * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; error = do_dentry_open(f, inode, NULL); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use * @path: path of the file to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). * @path may be on the stackable filesystem and backing inode on the * underlying filesystem. In this case, we want to be able to return * the @real_path of the backing inode. This is done by embedding the * returned file into a container structure that also stores the path of * the backing inode on the underlying filesystem, which can be * retrieved using backing_file_real_path(). */ struct file *backing_file_open(const struct path *path, int flags, const struct path *real_path, const struct cred *cred) { struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; path_get(real_path); *backing_file_real_path(f) = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(backing_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that either shouldn't be set by userspace like * FMODE_NONOTIFY or that aren't relevant in determining struct * open_flags like O_CLOEXEC. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static long do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = close_fd_get_file(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; return retval; } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: reserved for future extensions * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { return __close_range(fd, max_fd, flags); } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PTRACE_H #define _LINUX_PTRACE_H #include <linux/compiler.h> /* For unlikely. */ #include <linux/sched.h> /* For struct task_struct. */ #include <linux/sched/signal.h> /* For send_sig(), same_thread_group(), etc. */ #include <linux/err.h> /* for IS_ERR_VALUE */ #include <linux/bug.h> /* For BUG_ON. */ #include <linux/pid_namespace.h> /* For task_active_pid_ns. */ #include <uapi/linux/ptrace.h> #include <linux/seccomp.h> /* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ struct syscall_info { __u64 sp; struct seccomp_data data; }; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); /* * Ptrace flags * * The owner ship rules for task->ptrace which holds the ptrace * flags is simple. When a task is running it owns it's task->ptrace * flags. When the a task is stopped the ptracer owns task->ptrace. */ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ #define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) #define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_notify(int exit_code, unsigned long message); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) /** * ptrace_may_access - check whether the caller is permitted to access * a target task. * @task: target task * @mode: selects type of access and caller credentials * * Returns true on success, false on denial. * * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must * be set in @mode to specify whether the access was requested through * a filesystem syscall (should use effective capabilities and fsuid * of the caller) or through an explicit syscall such as * process_vm_writev or ptrace (and should use the real credentials). */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); } static inline void ptrace_unlink(struct task_struct *child) { if (unlikely(child->ptrace)) __ptrace_unlink(child); } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data); int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); /** * ptrace_parent - return the task that is tracing the given task * @task: task to consider * * Returns %NULL if no one is tracing @task, or the &struct task_struct * pointer to its tracer. * * Must called under rcu_read_lock(). The pointer returned might be kept * live only by RCU. During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { ptrace_notify((event << 8) | SIGTRAP, message); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. */ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. */ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop(). It can be * defined to a constant if arch_ptrace_stop() is never required, or always * is. On machines where this makes sense, it should be defined to a quick * test to optimize out calling arch_ptrace_stop() when it would be * superfluous. For example, if the thread has not been back to user mode * since the last stop, the thread state might indicate that nothing needs * to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed() (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop() do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); /* * ptrace report for syscall entry and exit looks identical. */ static inline int ptrace_report_syscall(unsigned long message) { int ptrace = current->ptrace; int signr; if (!(ptrace & PT_PTRACED)) return 0; signr = ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0), message); /* * this isn't the same as continuing with a signal, but it will do * for normal use. strace only continues with a signal if the * stopping signal is not SIGTRAP. -brl */ if (signr) send_sig(signr, current, 1); return fatal_signal_pending(current); } /** * ptrace_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is * made. If @task ever returns to user mode after this, its register state * is unspecified, but should be something harmless like an %ENOSYS error * return. It should preserve enough information so that syscall_rollback() * can work (see asm-generic/syscall.h). * * Called without locks, just after entering kernel mode. */ static inline __must_check int ptrace_report_syscall_entry( struct pt_regs *regs) { return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** * ptrace_report_syscall_exit - task has just finished a system call * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step) { if (step) user_single_step_report(regs); else ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT); } #endif
1529 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/stat.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <net/netrom.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/arp.h> #include <linux/init.h> static int nr_ndevs = 4; int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING; int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS; int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET; static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* * NETROM network devices are virtual network devices encapsulating NETROM * frames into AX.25 which will be sent through an AX.25 device, so form a * special "super class" of normal net devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); } static void nr_set_lockdep_key(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } /* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_del_node_init(sk); spin_unlock_bh(&nr_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void nr_kill_by_device(struct net_device *dev) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (nr_sk(s)->device == dev) nr_disconnect(s, ENETUNREACH); spin_unlock_bh(&nr_list_lock); } /* * Handle device status changes. */ static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_DOWN) return NOTIFY_DONE; nr_kill_by_device(dev); nr_rt_device_down(dev); return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void nr_insert_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_add_node(sk, &nr_list); spin_unlock_bh(&nr_list_lock); } /* * Find a socket that wants to accept the Connect Request we just * received. */ static struct sock *nr_find_listener(ax25_address *addr) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { sock_hold(s); goto found; } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given my circuit IDs. */ static struct sock *nr_find_socket(unsigned char index, unsigned char id) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given their circuit IDs. */ static struct sock *nr_find_peer(unsigned char index, unsigned char id, ax25_address *dest) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find next free circuit ID. */ static unsigned short nr_find_next_circuit(void) { unsigned short id = circuit; unsigned char i, j; struct sock *sk; for (;;) { i = id / 256; j = id % 256; if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; sock_put(sk); } id++; } return id; } /* * Deferred destroy. */ void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. */ static void nr_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void nr_destroy_socket(struct sock *sk) { struct sk_buff *skb; nr_remove_socket(sk); nr_stop_heartbeat(sk); nr_stop_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* Queue the unaccepted socket for death */ sock_set_flag(skb->sk, SOCK_DEAD); nr_start_heartbeat(skb->sk); nr_sk(skb->sk)->state = NR_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * NET/ROM socket object. */ static int nr_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); unsigned int opt; if (level != SOL_NETROM) return -ENOPROTOOPT; if (optlen < sizeof(unsigned int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(opt))) return -EFAULT; switch (optname) { case NETROM_T1: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t1 = opt * HZ; return 0; case NETROM_T2: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t2 = opt * HZ; return 0; case NETROM_N2: if (opt < 1 || opt > 31) return -EINVAL; nr->n2 = opt; return 0; case NETROM_T4: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t4 = opt * HZ; return 0; case NETROM_IDLE: if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; nr->idle = opt * 60 * HZ; return 0; default: return -ENOPROTOOPT; } } static int nr_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int val = 0; int len; if (level != SOL_NETROM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETROM_T1: val = nr->t1 / HZ; break; case NETROM_T2: val = nr->t2 / HZ; break; case NETROM_N2: val = nr->n2; break; case NETROM_T4: val = nr->t4 / HZ; break; case NETROM_IDLE: val = nr->idle / (60 * HZ); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; return copy_to_user(optval, &val, len) ? -EFAULT : 0; } static int nr_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { release_sock(sk); return -EINVAL; } if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; release_sock(sk); return 0; } release_sock(sk); return -EOPNOTSUPP; } static struct proto nr_proto = { .name = "NETROM", .owner = THIS_MODULE, .obj_size = sizeof(struct nr_sock), }; static int nr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct nr_sock *nr; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; nr = nr_sk(sk); sock_init_data(sock, sk); sock->ops = &nr_proto_ops; sk->sk_protocol = protocol; skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); nr->t1 = msecs_to_jiffies(sysctl_netrom_transport_timeout); nr->t2 = msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay); nr->n2 = msecs_to_jiffies(sysctl_netrom_transport_maximum_tries); nr->t4 = msecs_to_jiffies(sysctl_netrom_transport_busy_delay); nr->idle = msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout); nr->window = sysctl_netrom_transport_requested_window_size; nr->bpqext = 1; nr->state = NR_STATE_0; return 0; } static struct sock *nr_make_new(struct sock *osk) { struct sock *sk; struct nr_sock *nr, *onr; if (osk->sk_type != SOCK_SEQPACKET) return NULL; sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; nr = nr_sk(sk); sock_init_data(NULL, sk); sk->sk_type = osk->sk_type; sk->sk_priority = osk->sk_priority; sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); onr = nr_sk(osk); nr->t1 = onr->t1; nr->t2 = onr->t2; nr->n2 = onr->n2; nr->t4 = onr->t4; nr->idle = onr->idle; nr->window = onr->window; nr->device = onr->device; nr->bpqext = onr->bpqext; return sk; } static int nr_release(struct socket *sock) { struct sock *sk = sock->sk; struct nr_sock *nr; if (sk == NULL) return 0; sock_hold(sk); sock_orphan(sk); lock_sock(sk); nr = nr_sk(sk); switch (nr->state) { case NR_STATE_0: case NR_STATE_1: case NR_STATE_2: nr_disconnect(sk, 0); nr_destroy_socket(sk); break; case NR_STATE_3: nr_clear_queues(sk); nr->n2count = 0; nr_write_internal(sk, NR_DISCREQ); nr_start_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr->state = NR_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; struct net_device *dev; ax25_uid_assoc *user; ax25_address *source; lock_sock(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EINVAL; } if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) { release_sock(sk); return -EINVAL; } if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) { release_sock(sk); return -EINVAL; } if (addr->fsa_ax25.sax25_family != AF_NETROM) { release_sock(sk); return -EINVAL; } if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { release_sock(sk); return -EADDRNOTAVAIL; } /* * Only the super user can set an arbitrary user callsign. */ if (addr->fsa_ax25.sax25_ndigis == 1) { if (!capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); release_sock(sk); return -EPERM; } nr->user_addr = addr->fsa_digipeater[0]; nr->source_addr = addr->fsa_ax25.sax25_call; } else { source = &addr->fsa_ax25.sax25_call; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { release_sock(sk); dev_put(dev); return -EPERM; } nr->user_addr = *source; } nr->source_addr = *source; } nr->device = dev; nr_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); dev_put(dev); release_sock(sk); return 0; } static int nr_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; const ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; int err = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out_release; /* Connect completed during a ERESTARTSYS event */ } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { err = -EISCONN; /* No reconnect on a seqpacket socket */ goto out_release; } if (sock->state == SS_CONNECTING) { err = -EALREADY; goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { err = -EINVAL; goto out_release; } if (addr->sax25_family != AF_NETROM) { err = -EINVAL; goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ sock_reset_flag(sk, SOCK_ZAPPED); if ((dev = nr_dev_first()) == NULL) { err = -ENETUNREACH; goto out_release; } source = (const ax25_address *)dev->dev_addr; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { dev_put(dev); err = -EPERM; goto out_release; } nr->user_addr = *source; } nr->source_addr = *source; nr->device = dev; dev_put(dev); nr_insert_socket(sk); /* Finish the bind */ } nr->dest_addr = addr->sax25_call; release_sock(sk); circuit = nr_find_next_circuit(); lock_sock(sk); nr->my_index = circuit / 256; nr->my_id = circuit % 256; circuit++; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; nr_establish_data_link(sk); nr->state = NR_STATE_1; nr_start_heartbeat(sk); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } /* * A Connect Ack with Choke or timeout or failed routing will go to * closed. */ if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; out_release: release_sock(sk); return err; } static int nr_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sk_buff *skb; struct sock *newsk; DEFINE_WAIT(wait); struct sock *sk; int err = 0; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ kfree_skb(skb); sk_acceptq_removed(sk); out_release: release_sock(sk); return err; } static int nr_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int uaddr_len; memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); lock_sock(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 1; sax->fsa_ax25.sax25_call = nr->user_addr; memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); sax->fsa_digipeater[0] = nr->dest_addr; uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; sax->fsa_ax25.sax25_call = nr->source_addr; uaddr_len = sizeof(struct sockaddr_ax25); } release_sock(sk); return uaddr_len; } int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) { struct sock *sk; struct sock *make; struct nr_sock *nr_make; ax25_address *src, *dest, *user; unsigned short circuit_index, circuit_id; unsigned short peer_circuit_index, peer_circuit_id; unsigned short frametype, flags, window, timeout; int ret; skb_orphan(skb); /* * skb->data points to the netrom frame start */ src = (ax25_address *)(skb->data + 0); dest = (ax25_address *)(skb->data + 7); circuit_index = skb->data[15]; circuit_id = skb->data[16]; peer_circuit_index = skb->data[17]; peer_circuit_id = skb->data[18]; frametype = skb->data[19] & 0x0F; flags = skb->data[19] & 0xF0; /* * Check for an incoming IP over NET/ROM frame. */ if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb_reset_transport_header(skb); return nr_rx_ip(skb, dev); } /* * Find an existing socket connection, based on circuit ID, if it's * a Connect Request base it on their circuit ID. * * Circuit ID 0/0 is not valid but it could still be a "reset" for a * circuit that no longer exists at the other end ... */ sk = NULL; if (circuit_index == 0 && circuit_id == 0) { if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); } else { if (frametype == NR_CONNREQ) sk = nr_find_peer(circuit_index, circuit_id, src); else sk = nr_find_socket(circuit_index, circuit_id); } if (sk != NULL) { bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) nr_sk(sk)->bpqext = 1; else nr_sk(sk)->bpqext = 0; ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); sock_put(sk); return ret; } /* * Now it should be a CONNREQ. */ if (frametype != NR_CONNREQ) { /* * Here it would be nice to be able to send a reset but * NET/ROM doesn't have one. We've tried to extend the protocol * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that * apparently kills BPQ boxes... :-( * So now we try to follow the established behaviour of * G8PZT's Xrouter which is sending packets with command type 7 * as an extension of the protocol. */ if (sysctl_netrom_reset_circuit && (frametype != NR_RESET || flags != 0)) nr_transmit_reset(skb, 1); return 0; } sk = nr_find_listener(dest); user = (ax25_address *)(skb->data + 21); if (sk == NULL || sk_acceptq_is_full(sk) || (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) sock_put(sk); return 0; } bh_lock_sock(sk); window = skb->data[20]; sock_hold(make); skb->sk = make; skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; /* Fill in his circuit details */ nr_make = nr_sk(make); nr_make->source_addr = *dest; nr_make->dest_addr = *src; nr_make->user_addr = *user; nr_make->your_index = circuit_index; nr_make->your_id = circuit_id; bh_unlock_sock(sk); circuit = nr_find_next_circuit(); bh_lock_sock(sk); nr_make->my_index = circuit / 256; nr_make->my_id = circuit % 256; circuit++; /* Window negotiation */ if (window < nr_make->window) nr_make->window = window; /* L4 timeout negotiation */ if (skb->len == 37) { timeout = skb->data[36] * 256 + skb->data[35]; if (timeout * HZ < nr_make->t1) nr_make->t1 = timeout * HZ; nr_make->bpqext = 1; } else { nr_make->bpqext = 0; } nr_write_internal(make, NR_CONNACK); nr_make->condition = 0x00; nr_make->vs = 0; nr_make->va = 0; nr_make->vr = 0; nr_make->vl = 0; nr_make->state = NR_STATE_3; sk_acceptq_added(sk); skb_queue_head(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); bh_unlock_sock(sk); sock_put(sk); nr_insert_socket(make); nr_start_heartbeat(make); nr_start_idletimer(make); return 1; } static int nr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); int err; struct sockaddr_ax25 sax; struct sk_buff *skb; unsigned char *asmptr; int size; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; goto out; } if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); err = -EPIPE; goto out; } if (nr->device == NULL) { err = -ENETUNREACH; goto out; } if (usax) { if (msg->msg_namelen < sizeof(sax)) { err = -EINVAL; goto out; } sax = *usax; if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) { err = -EISCONN; goto out; } if (sax.sax25_family != AF_NETROM) { err = -EINVAL; goto out; } } else { if (sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } sax.sax25_family = AF_NETROM; sax.sax25_call = nr->dest_addr; } /* Build a packet - the conventional user limit is 236 bytes. We can do ludicrously large NetROM frames but must not overflow */ if (len > 65536) { err = -EMSGSIZE; goto out; } size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN; if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) goto out; skb_reserve(skb, size - len); skb_reset_transport_header(skb); /* * Push down the NET/ROM header */ asmptr = skb_push(skb, NR_TRANSPORT_LEN); /* Build a NET/ROM Transport header */ *asmptr++ = nr->your_index; *asmptr++ = nr->your_id; *asmptr++ = 0; /* To be filled in later */ *asmptr++ = 0; /* Ditto */ *asmptr++ = NR_INFO; /* * Put the data on the end */ skb_put(skb, len); /* User data follows immediately after the NET/ROM transport header */ if (memcpy_from_msg(skb_transport_header(skb), msg, len)) { kfree_skb(skb); err = -EFAULT; goto out; } if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); err = -ENOTCONN; goto out; } nr_output(sk, skb); /* Shove it onto the queue */ err = len; out: release_sock(sk); return err; } static int nr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); size_t copied; struct sk_buff *skb; int er; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } /* Now we can treat all alike */ skb = skb_recv_datagram(sk, flags, &er); if (!skb) { release_sock(sk); return er; } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); return er; } if (sax != NULL) { memset(sax, 0, sizeof(*sax)); sax->sax25_family = AF_NETROM; skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, AX25_ADDR_LEN); msg->msg_namelen = sizeof(*sax); } skb_free_datagram(sk, skb); release_sock(sk); return copied; } static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; switch (cmd) { case TIOCOUTQ: { long amount; lock_sock(sk); amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; release_sock(sk); return put_user(amount, (int __user *)argp); } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); return put_user(amount, (int __user *)argp); } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: return -ENOIOCTLCMD; } return 0; } #ifdef CONFIG_PROC_FS static void *nr_info_start(struct seq_file *seq, loff_t *pos) __acquires(&nr_list_lock) { spin_lock_bh(&nr_list_lock); return seq_hlist_start_head(&nr_list, *pos); } static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_list, pos); } static void nr_info_stop(struct seq_file *seq, void *v) __releases(&nr_list_lock) { spin_unlock_bh(&nr_list_lock); } static int nr_info_show(struct seq_file *seq, void *v) { struct sock *s = sk_entry(v); struct net_device *dev; struct nr_sock *nr; const char *devname; char buf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); else { bh_lock_sock(s); nr = nr_sk(s); if ((dev = nr->device) == NULL) devname = "???"; else devname = dev->name; seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); seq_printf(seq, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", ax2asc(buf, &nr->source_addr), devname, nr->my_index, nr->my_id, nr->your_index, nr->your_id, nr->state, nr->vs, nr->vr, nr->va, ax25_display_timer(&nr->t1timer) / HZ, nr->t1 / HZ, ax25_display_timer(&nr->t2timer) / HZ, nr->t2 / HZ, ax25_display_timer(&nr->t4timer) / HZ, nr->t4 / HZ, ax25_display_timer(&nr->idletimer) / (60 * HZ), nr->idle / (60 * HZ), nr->n2count, nr->n2, nr->window, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); bh_unlock_sock(s); } return 0; } static const struct seq_operations nr_info_seqops = { .start = nr_info_start, .next = nr_info_next, .stop = nr_info_stop, .show = nr_info_show, }; #endif /* CONFIG_PROC_FS */ static const struct net_proto_family nr_family_ops = { .family = PF_NETROM, .create = nr_create, .owner = THIS_MODULE, }; static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, .bind = nr_bind, .connect = nr_connect, .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, .poll = datagram_poll, .ioctl = nr_ioctl, .gettstamp = sock_gettstamp, .listen = nr_listen, .shutdown = sock_no_shutdown, .setsockopt = nr_setsockopt, .getsockopt = nr_getsockopt, .sendmsg = nr_sendmsg, .recvmsg = nr_recvmsg, .mmap = sock_no_mmap, }; static struct notifier_block nr_dev_notifier = { .notifier_call = nr_device_event, }; static struct net_device **dev_nr; static struct ax25_protocol nr_pid = { .pid = AX25_P_NETROM, .func = nr_route_frame }; static struct ax25_linkfail nr_linkfail_notifier = { .func = nr_link_failed, }; static int __init nr_proto_init(void) { int i; int rc = proto_register(&nr_proto, 0); if (rc) return rc; if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { pr_err("NET/ROM: %s - nr_ndevs parameter too large\n", __func__); rc = -EINVAL; goto unregister_proto; } dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (!dev_nr) { pr_err("NET/ROM: %s - unable to allocate device array\n", __func__); rc = -ENOMEM; goto unregister_proto; } for (i = 0; i < nr_ndevs; i++) { char name[IFNAMSIZ]; struct net_device *dev; sprintf(name, "nr%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { rc = -ENOMEM; goto fail; } dev->base_addr = i; rc = register_netdev(dev); if (rc) { free_netdev(dev); goto fail; } nr_set_lockdep_key(dev); dev_nr[i] = dev; } rc = sock_register(&nr_family_ops); if (rc) goto fail; rc = register_netdevice_notifier(&nr_dev_notifier); if (rc) goto out_sock; ax25_register_pid(&nr_pid); ax25_linkfail_register(&nr_linkfail_notifier); #ifdef CONFIG_SYSCTL rc = nr_register_sysctl(); if (rc) goto out_sysctl; #endif nr_loopback_init(); rc = -ENOMEM; if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops)) goto proc_remove1; if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops)) goto proc_remove2; if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops)) goto proc_remove3; return 0; proc_remove3: remove_proc_entry("nr_neigh", init_net.proc_net); proc_remove2: remove_proc_entry("nr", init_net.proc_net); proc_remove1: nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); out_sysctl: #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); out_sock: sock_unregister(PF_NETROM); fail: while (--i >= 0) { unregister_netdev(dev_nr[i]); free_netdev(dev_nr[i]); } kfree(dev_nr); unregister_proto: proto_unregister(&nr_proto); return rc; } module_init(nr_proto_init); module_param(nr_ndevs, int, 0); MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices"); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NETROM); static void __exit nr_exit(void) { int i; remove_proc_entry("nr", init_net.proc_net); remove_proc_entry("nr_neigh", init_net.proc_net); remove_proc_entry("nr_nodes", init_net.proc_net); nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); sock_unregister(PF_NETROM); for (i = 0; i < nr_ndevs; i++) { struct net_device *dev = dev_nr[i]; if (dev) { unregister_netdev(dev); free_netdev(dev); } } kfree(dev_nr); proto_unregister(&nr_proto); } module_exit(nr_exit);
87 81 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM page_pool #if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_POOL_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> #include <net/page_pool/types.h> TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), TP_ARGS(pool, inflight, hold, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(s32, inflight) __field(u32, hold) __field(u32, release) __field(u64, cnt) ), TP_fast_assign( __entry->pool = pool; __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; __entry->cnt = pool->destroy_cnt; ), TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", __entry->pool, __entry->inflight, __entry->hold, __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 release), TP_ARGS(pool, page, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, release) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->release = release; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx release=%u", __entry->pool, __entry->page, __entry->pfn, __entry->release) ); TRACE_EVENT(page_pool_state_hold, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 hold), TP_ARGS(pool, page, hold), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, hold) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->hold = hold; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx hold=%u", __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); TRACE_EVENT(page_pool_update_nid, TP_PROTO(const struct page_pool *pool, int new_nid), TP_ARGS(pool, new_nid), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(int, pool_nid) __field(int, new_nid) ), TP_fast_assign( __entry->pool = pool; __entry->pool_nid = pool->p.nid; __entry->new_nid = new_nid; ), TP_printk("page_pool=%p pool_nid=%d new_nid=%d", __entry->pool, __entry->pool_nid, __entry->new_nid) ); #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H #ifdef __KERNEL__ #include <asm/nops.h> #include <asm/processor-flags.h> #include <linux/irqflags.h> #include <linux/jump_label.h> /* * The compiler should not reorder volatile asm statements with respect to each * other: they should execute in program order. However GCC 4.9.x and 5.x have * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder * volatile asm. The write functions are not affected since they have memory * clobbers preventing reordering. To prevent reads from being reordered with * respect to writes, use a dummy memory operand. */ #define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) { unsigned long val; #ifdef CONFIG_X86_32 /* * This could fault if CR4 does not exist. Non-existent CR4 * is functionally equivalent to CR4 == 0. Keep it simple and pretend * that CR4 == 0 on CPUs that don't have CR4. */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. */ asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; /* * "rdpkru" instruction. Places PKRU contents in to EAX, * clears EDX and requires that ecx=0. */ asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (pkru), "=d" (edx) : "c" (ecx)); return pkru; } static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; /* * "wrpkru" instruction. Loads contents in EAX to PKRU, * requires that ecx = edx = 0. */ asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } #else static inline u32 rdpkru(void) { return 0; } static inline void wrpkru(u32 pkru) { } #endif static __always_inline void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } static inline unsigned long __read_cr4(void) { return native_read_cr4(); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline unsigned long read_cr0(void) { return native_read_cr0(); } static inline void write_cr0(unsigned long x) { native_write_cr0(x); } static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } /* * Careful! CR3 contains more than just an address. You probably want * read_cr3_pa() instead. */ static inline unsigned long __read_cr3(void) { return __native_read_cr3(); } static inline void write_cr3(unsigned long x) { native_write_cr3(x); } static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } static __always_inline void wbinvd(void) { native_wbinvd(); } #endif /* CONFIG_PARAVIRT_XXL */ static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { alternative_io(".byte 0x3e; clflush %P0", ".byte 0x66; clflush %P0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; asm volatile(ALTERNATIVE_2( ".byte 0x3e; clflush (%[pax])", ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ X86_FEATURE_CLFLUSHOPT, ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ X86_FEATURE_CLWB) : [p] "+m" (*p) : [pax] "a" (p)); } #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { asm_volatile_goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); return 0; fail: return -EFAULT; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #define nop() asm volatile ("nop") static inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ static inline void movdir64b(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; /* * MOVDIR64B %(rdx), rax. * * Both __src and __dst must be memory constraints in order to tell the * compiler that no other memory accesses should be reordered around * this one. * * Also, both must be supplied as lvalues because this tells * the compiler what the object is (its size) the instruction accesses. * I.e., not the pointers but what they point to, thus the deref'ing '*'. */ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" : "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); } /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: 512 bits memory operand * * The ENQCMDS instruction allows software to write a 512-bit command to * a 512-bit-aligned special MMIO region that supports the instruction. * A return status is loaded into the ZF flag in the RFLAGS register. * ZF = 0 equates to success, and ZF = 1 indicates retry or error. * * This function issues the ENQCMDS instruction to submit data from * kernel space to MMIO space, in a unit of 512 bits. Order of data access * is not guaranteed, nor is a memory barrier performed afterwards. It * returns 0 on success and -EAGAIN on failure. * * Warning: Do not use this helper unless your driver has checked that the * ENQCMDS instruction is supported on the platform and the device accepts * ENQCMDS. */ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; bool zf; /* * ENQCMDS %(rdx), rax * * See movdir64b()'s comment on operand specification. */ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" CC_SET(z) : CC_OUT(z) (zf), "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); /* Submission failure is indicated via EFLAGS.ZF=1 */ if (zf) return -EAGAIN; return 0; } static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils * version >= 2.36. */ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); } #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */
1036 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of device match structures to search in * @dev: the of device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *node, *of_node = dev->of_node; int count, i; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; count = of_property_count_elems_of_size(of_node, "memory-region", sizeof(u32)); /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (count <= 0) { of_node = np; count = of_property_count_elems_of_size( of_node, "memory-region", sizeof(u32)); } for (i = 0; i < count; i++) { node = of_parse_phandle(of_node, "memory-region", i); /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(node, "restricted-dma-pool") && of_device_is_available(node)) { of_node_put(node); break; } of_node_put(node); } /* * Attempt to initialize a restricted-dma-pool region if one was found. * Note that count can hold a negative error code. */ if (i < count && of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct iommu_ops *iommu; const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 dma_start = 0; u64 mask, end, size = 0; bool coherent; int ret; if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { const struct bus_dma_region *r = map; u64 dma_end = 0; /* Determine the overall bounds of all DMA regions */ for (dma_start = ~0; r->size; r++) { /* Take lower and upper limits */ if (r->dma_start < dma_start) dma_start = r->dma_start; if (r->dma_start + r->size > dma_end) dma_end = r->dma_start + r->size; } size = dma_end - dma_start; /* * Add a work around to treat the size as mask + 1 in case * it is defined in DT as a mask. */ if (size & 1) { dev_warn(dev, "Invalid size 0x%llx for dma-range(s)\n", size); size = size + 1; } if (!size) { dev_err(dev, "Adjusted size 0x%llx invalid\n", size); kfree(map); return -EINVAL; } } /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!size && dev->coherent_dma_mask) size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); else if (!size) size = 1ULL << 32; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ end = dma_start + size - 1; mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (!ret) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); iommu = of_iommu_configure(dev, np, id); if (PTR_ERR(iommu) == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (!ret) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } dev_dbg(dev, "device is%sbehind an iommu\n", iommu ? " " : " not "); arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); if (!iommu) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias);
6 27 27 27 27 27 27 27 27 27 27 27 27 27 27 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ /* just for IFNAMSIZ */ #include <linux/if.h> #include <linux/slab.h> #include <linux/export.h> #include "led.h" void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { if (!atomic_read(&local->assoc_led_active)) return; if (associated) led_trigger_event(&local->assoc_led, LED_FULL); else led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { if (!atomic_read(&local->radio_led_active)) return; if (enabled) led_trigger_event(&local->radio_led, LED_FULL); else led_trigger_event(&local->radio_led, LED_OFF); } void ieee80211_alloc_led_names(struct ieee80211_local *local) { local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", wiphy_name(local->hw.wiphy)); local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", wiphy_name(local->hw.wiphy)); local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", wiphy_name(local->hw.wiphy)); local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", wiphy_name(local->hw.wiphy)); } void ieee80211_free_led_names(struct ieee80211_local *local) { kfree(local->rx_led.name); kfree(local->tx_led.name); kfree(local->assoc_led.name); kfree(local->radio_led.name); } static int ieee80211_tx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_inc(&local->tx_led_active); return 0; } static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_dec(&local->tx_led_active); } static int ieee80211_rx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_inc(&local->rx_led_active); return 0; } static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_dec(&local->rx_led_active); } static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_inc(&local->assoc_led_active); return 0; } static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_dec(&local->assoc_led_active); } static int ieee80211_radio_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_inc(&local->radio_led_active); return 0; } static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_dec(&local->radio_led_active); } static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_inc(&local->tpt_led_active); return 0; } static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { atomic_set(&local->rx_led_active, 0); local->rx_led.activate = ieee80211_rx_led_activate; local->rx_led.deactivate = ieee80211_rx_led_deactivate; if (local->rx_led.name && led_trigger_register(&local->rx_led)) { kfree(local->rx_led.name); local->rx_led.name = NULL; } atomic_set(&local->tx_led_active, 0); local->tx_led.activate = ieee80211_tx_led_activate; local->tx_led.deactivate = ieee80211_tx_led_deactivate; if (local->tx_led.name && led_trigger_register(&local->tx_led)) { kfree(local->tx_led.name); local->tx_led.name = NULL; } atomic_set(&local->assoc_led_active, 0); local->assoc_led.activate = ieee80211_assoc_led_activate; local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { kfree(local->assoc_led.name); local->assoc_led.name = NULL; } atomic_set(&local->radio_led_active, 0); local->radio_led.activate = ieee80211_radio_led_activate; local->radio_led.deactivate = ieee80211_radio_led_deactivate; if (local->radio_led.name && led_trigger_register(&local->radio_led)) { kfree(local->radio_led.name); local->radio_led.name = NULL; } atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { local->tpt_led.activate = ieee80211_tpt_led_activate; local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } } } void ieee80211_led_exit(struct ieee80211_local *local) { if (local->radio_led.name) led_trigger_unregister(&local->radio_led); if (local->assoc_led.name) led_trigger_unregister(&local->assoc_led); if (local->tx_led.name) led_trigger_unregister(&local->tx_led); if (local->rx_led.name) led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); static unsigned long tpt_trig_traffic(struct ieee80211_local *local, struct tpt_led_trigger *tpt_trig) { unsigned long traffic, delta; traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; delta = traffic - tpt_trig->prev_traffic; tpt_trig->prev_traffic = traffic; return DIV_ROUND_UP(delta, 1024 / 8); } static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; if (!tpt_trig->running) return; mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); tpt = tpt_trig_traffic(local, tpt_trig); /* default to just solid on */ on = 1; off = 0; for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { if (tpt_trig->blink_table[i].throughput < 0 || tpt > tpt_trig->blink_table[i].throughput) { off = tpt_trig->blink_table[i].blink_time / 2; on = tpt_trig->blink_table[i].blink_time - off; break; } } led_trigger_blink(&local->tpt_led, on, off); } const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; if (WARN_ON(local->tpt_led_trigger)) return NULL; tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); if (!tpt_trig) return NULL; snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; tpt_trig->local = local; timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; return tpt_trig->name; } EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (tpt_trig->running) return; /* reset traffic */ tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (!tpt_trig->running) return; tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; bool allowed; WARN_ON(types_on & types_off); if (!tpt_trig) return; tpt_trig->active &= ~types_off; tpt_trig->active |= types_on; /* * Regardless of wanted state, we shouldn't blink when * the radio is disabled -- this can happen due to some * code ordering issues with __ieee80211_recalc_idle() * being called before the radio is started. */ allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO; if (!allowed || !(tpt_trig->active & tpt_trig->want)) ieee80211_stop_tpt_led_trig(local); else ieee80211_start_tpt_led_trig(local); }
859 859 859 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
114 12 11 114 114 114 2 114 10 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux network device link state notification * * Author: * Stefan Rompf <sux@loplof.de> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/types.h> #include "dev.h" enum lw_bits { LW_URGENT = 0, }; static unsigned long linkwatch_flags; static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; /* Some uppers (DSA) have additional sources for being down, so * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { int iflink = dev_get_iflink(dev); struct net_device *peer; if (iflink == dev->ifindex) return IF_OPER_DOWN; peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; return netif_carrier_ok(peer) ? IF_OPER_DOWN : IF_OPER_LOWERLAYERDOWN; } if (netif_dormant(dev)) return IF_OPER_DORMANT; return IF_OPER_UP; } static void rfc2863_policy(struct net_device *dev) { unsigned char operstate = default_operstate(dev); if (operstate == dev->operstate) return; write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) operstate = IF_OPER_TESTING; break; case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; case IF_LINK_MODE_DEFAULT: default: break; } dev->operstate = operstate; write_unlock(&dev_base_lock); } void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ if (!netif_carrier_ok(dev) || netif_dormant(dev) || netif_testing(dev)) rfc2863_policy(dev); } static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) return false; if (dev->ifindex != dev_get_iflink(dev)) return true; if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } static void linkwatch_add_event(struct net_device *dev) { unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } static void linkwatch_schedule_work(int urgent) { unsigned long delay = linkwatch_nextevent - jiffies; if (test_bit(LW_URGENT, &linkwatch_flags)) return; /* Minimise down-time: drop delay for up event. */ if (urgent) { if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) return; delay = 0; } /* If we wrap around we'll delay it by at most HZ. */ if (delay > HZ) delay = 0; /* * If urgent, schedule immediate execution; otherwise, don't * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) mod_delayed_work(system_wq, &linkwatch_work, 0); else schedule_delayed_work(&linkwatch_work, delay); } static void linkwatch_do_dev(struct net_device *dev) { /* * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted */ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else dev_deactivate(dev); netdev_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). */ __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) { #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; struct net_device *dev; LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) do_dev += MAX_DO_DEV_PER_LOOP; /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not * cause a storm of messages on the netlink * socket. This limit does not apply to up events * while the device qdisc is down. */ if (!urgent_only) linkwatch_nextevent = jiffies + HZ; /* Limit wrap-around effect on delay. */ else if (time_after(linkwatch_nextevent, jiffies + HZ)) linkwatch_nextevent = jiffies; clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } /* We must free netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } /* Add the remaining work back to lweventlist */ list_splice_init(&wrk, &lweventlist); if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); } void linkwatch_forget_dev(struct net_device *dev) { unsigned long flags; int clean = 0; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = 1; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); if (clean) linkwatch_do_dev(dev); } /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { __linkwatch_run_queue(0); } static void linkwatch_event(struct work_struct *dummy) { rtnl_lock(); __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); rtnl_unlock(); } void linkwatch_fire_event(struct net_device *dev) { bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { linkwatch_add_event(dev); } else if (!urgent) return; linkwatch_schedule_work(urgent); } EXPORT_SYMBOL(linkwatch_fire_event);
9142 9143 9143 8654 8653 8654 9137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/common.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/string_helpers.h> #include "common.h" /* String table for operation mode. */ const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = { [TOMOYO_CONFIG_DISABLED] = "disabled", [TOMOYO_CONFIG_LEARNING] = "learning", [TOMOYO_CONFIG_PERMISSIVE] = "permissive", [TOMOYO_CONFIG_ENFORCING] = "enforcing" }; /* String table for /sys/kernel/security/tomoyo/profile */ const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = "execute", [TOMOYO_MAC_FILE_OPEN] = "open", [TOMOYO_MAC_FILE_CREATE] = "create", [TOMOYO_MAC_FILE_UNLINK] = "unlink", [TOMOYO_MAC_FILE_GETATTR] = "getattr", [TOMOYO_MAC_FILE_MKDIR] = "mkdir", [TOMOYO_MAC_FILE_RMDIR] = "rmdir", [TOMOYO_MAC_FILE_MKFIFO] = "mkfifo", [TOMOYO_MAC_FILE_MKSOCK] = "mksock", [TOMOYO_MAC_FILE_TRUNCATE] = "truncate", [TOMOYO_MAC_FILE_SYMLINK] = "symlink", [TOMOYO_MAC_FILE_MKBLOCK] = "mkblock", [TOMOYO_MAC_FILE_MKCHAR] = "mkchar", [TOMOYO_MAC_FILE_LINK] = "link", [TOMOYO_MAC_FILE_RENAME] = "rename", [TOMOYO_MAC_FILE_CHMOD] = "chmod", [TOMOYO_MAC_FILE_CHOWN] = "chown", [TOMOYO_MAC_FILE_CHGRP] = "chgrp", [TOMOYO_MAC_FILE_IOCTL] = "ioctl", [TOMOYO_MAC_FILE_CHROOT] = "chroot", [TOMOYO_MAC_FILE_MOUNT] = "mount", [TOMOYO_MAC_FILE_UMOUNT] = "unmount", [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root", /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = "inet_stream_bind", [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = "inet_stream_listen", [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = "inet_stream_connect", [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = "inet_dgram_bind", [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = "inet_dgram_send", [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = "inet_raw_bind", [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = "inet_raw_send", [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = "unix_stream_bind", [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = "unix_stream_listen", [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = "unix_stream_connect", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = "unix_dgram_bind", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = "unix_dgram_send", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] = "unix_seqpacket_bind", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = "unix_seqpacket_listen", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect", /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = "env", /* CONFIG group */ [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* String table for conditions. */ const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = { [TOMOYO_TASK_UID] = "task.uid", [TOMOYO_TASK_EUID] = "task.euid", [TOMOYO_TASK_SUID] = "task.suid", [TOMOYO_TASK_FSUID] = "task.fsuid", [TOMOYO_TASK_GID] = "task.gid", [TOMOYO_TASK_EGID] = "task.egid", [TOMOYO_TASK_SGID] = "task.sgid", [TOMOYO_TASK_FSGID] = "task.fsgid", [TOMOYO_TASK_PID] = "task.pid", [TOMOYO_TASK_PPID] = "task.ppid", [TOMOYO_EXEC_ARGC] = "exec.argc", [TOMOYO_EXEC_ENVC] = "exec.envc", [TOMOYO_TYPE_IS_SOCKET] = "socket", [TOMOYO_TYPE_IS_SYMLINK] = "symlink", [TOMOYO_TYPE_IS_FILE] = "file", [TOMOYO_TYPE_IS_BLOCK_DEV] = "block", [TOMOYO_TYPE_IS_DIRECTORY] = "directory", [TOMOYO_TYPE_IS_CHAR_DEV] = "char", [TOMOYO_TYPE_IS_FIFO] = "fifo", [TOMOYO_MODE_SETUID] = "setuid", [TOMOYO_MODE_SETGID] = "setgid", [TOMOYO_MODE_STICKY] = "sticky", [TOMOYO_MODE_OWNER_READ] = "owner_read", [TOMOYO_MODE_OWNER_WRITE] = "owner_write", [TOMOYO_MODE_OWNER_EXECUTE] = "owner_execute", [TOMOYO_MODE_GROUP_READ] = "group_read", [TOMOYO_MODE_GROUP_WRITE] = "group_write", [TOMOYO_MODE_GROUP_EXECUTE] = "group_execute", [TOMOYO_MODE_OTHERS_READ] = "others_read", [TOMOYO_MODE_OTHERS_WRITE] = "others_write", [TOMOYO_MODE_OTHERS_EXECUTE] = "others_execute", [TOMOYO_EXEC_REALPATH] = "exec.realpath", [TOMOYO_SYMLINK_TARGET] = "symlink.target", [TOMOYO_PATH1_UID] = "path1.uid", [TOMOYO_PATH1_GID] = "path1.gid", [TOMOYO_PATH1_INO] = "path1.ino", [TOMOYO_PATH1_MAJOR] = "path1.major", [TOMOYO_PATH1_MINOR] = "path1.minor", [TOMOYO_PATH1_PERM] = "path1.perm", [TOMOYO_PATH1_TYPE] = "path1.type", [TOMOYO_PATH1_DEV_MAJOR] = "path1.dev_major", [TOMOYO_PATH1_DEV_MINOR] = "path1.dev_minor", [TOMOYO_PATH2_UID] = "path2.uid", [TOMOYO_PATH2_GID] = "path2.gid", [TOMOYO_PATH2_INO] = "path2.ino", [TOMOYO_PATH2_MAJOR] = "path2.major", [TOMOYO_PATH2_MINOR] = "path2.minor", [TOMOYO_PATH2_PERM] = "path2.perm", [TOMOYO_PATH2_TYPE] = "path2.type", [TOMOYO_PATH2_DEV_MAJOR] = "path2.dev_major", [TOMOYO_PATH2_DEV_MINOR] = "path2.dev_minor", [TOMOYO_PATH1_PARENT_UID] = "path1.parent.uid", [TOMOYO_PATH1_PARENT_GID] = "path1.parent.gid", [TOMOYO_PATH1_PARENT_INO] = "path1.parent.ino", [TOMOYO_PATH1_PARENT_PERM] = "path1.parent.perm", [TOMOYO_PATH2_PARENT_UID] = "path2.parent.uid", [TOMOYO_PATH2_PARENT_GID] = "path2.parent.gid", [TOMOYO_PATH2_PARENT_INO] = "path2.parent.ino", [TOMOYO_PATH2_PARENT_PERM] = "path2.parent.perm", }; /* String table for PREFERENCE keyword. */ static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = { [TOMOYO_PREF_MAX_AUDIT_LOG] = "max_audit_log", [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry", }; /* String table for path operation. */ const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = "execute", [TOMOYO_TYPE_READ] = "read", [TOMOYO_TYPE_WRITE] = "write", [TOMOYO_TYPE_APPEND] = "append", [TOMOYO_TYPE_UNLINK] = "unlink", [TOMOYO_TYPE_GETATTR] = "getattr", [TOMOYO_TYPE_RMDIR] = "rmdir", [TOMOYO_TYPE_TRUNCATE] = "truncate", [TOMOYO_TYPE_SYMLINK] = "symlink", [TOMOYO_TYPE_CHROOT] = "chroot", [TOMOYO_TYPE_UMOUNT] = "unmount", }; /* String table for socket's operation. */ const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = { [TOMOYO_NETWORK_BIND] = "bind", [TOMOYO_NETWORK_LISTEN] = "listen", [TOMOYO_NETWORK_CONNECT] = "connect", [TOMOYO_NETWORK_SEND] = "send", }; /* String table for categories. */ static const char * const tomoyo_category_keywords [TOMOYO_MAX_MAC_CATEGORY_INDEX] = { [TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* Permit policy management by non-root user? */ static bool tomoyo_manage_by_non_root; /* Utility functions. */ /** * tomoyo_addprintf - strncat()-like-snprintf(). * * @buffer: Buffer to write to. Must be '\0'-terminated. * @len: Size of @buffer. * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ __printf(3, 4) static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...) { va_list args; const int pos = strlen(buffer); va_start(args, fmt); vsnprintf(buffer + pos, len - pos - 1, fmt, args); va_end(args); } /** * tomoyo_flush - Flush queued string to userspace's buffer. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns true if all data was flushed, false otherwise. */ static bool tomoyo_flush(struct tomoyo_io_buffer *head) { while (head->r.w_pos) { const char *w = head->r.w[0]; size_t len = strlen(w); if (len) { if (len > head->read_user_buf_avail) len = head->read_user_buf_avail; if (!len) return false; if (copy_to_user(head->read_user_buf, w, len)) return false; head->read_user_buf_avail -= len; head->read_user_buf += len; w += len; } head->r.w[0] = w; if (*w) return false; /* Add '\0' for audit logs and query. */ if (head->poll) { if (!head->read_user_buf_avail || copy_to_user(head->read_user_buf, "", 1)) return false; head->read_user_buf_avail--; head->read_user_buf++; } head->r.w_pos--; for (len = 0; len < head->r.w_pos; len++) head->r.w[len] = head->r.w[len + 1]; } head->r.avail = 0; return true; } /** * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @string: String to print. * * Note that @string has to be kept valid until @head is kfree()d. * This means that char[] allocated on stack memory cannot be passed to * this function. Use tomoyo_io_printf() for char[] allocated on stack memory. */ static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string) { if (head->r.w_pos < TOMOYO_MAX_IO_READ_QUEUE) { head->r.w[head->r.w_pos++] = string; tomoyo_flush(head); } else WARN_ON(1); } static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) __printf(2, 3); /** * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @fmt: The printf()'s format string, followed by parameters. */ static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) { va_list args; size_t len; size_t pos = head->r.avail; int size = head->readbuf_size - pos; if (size <= 0) return; va_start(args, fmt); len = vsnprintf(head->read_buf + pos, size, fmt, args) + 1; va_end(args); if (pos + len >= head->readbuf_size) { WARN_ON(1); return; } head->r.avail += len; tomoyo_set_string(head, head->read_buf + pos); } /** * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_set_space(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, " "); } /** * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static bool tomoyo_set_lf(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, "\n"); return !head->r.w_pos; } /** * tomoyo_set_slash - Put a shash to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_set_slash(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, "/"); } /* List of namespaces. */ LIST_HEAD(tomoyo_namespace_list); /* True if namespace other than tomoyo_kernel_namespace is defined. */ static bool tomoyo_namespace_enabled; /** * tomoyo_init_policy_namespace - Initialize namespace. * * @ns: Pointer to "struct tomoyo_policy_namespace". * * Returns nothing. */ void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns) { unsigned int idx; for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++) INIT_LIST_HEAD(&ns->acl_group[idx]); for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++) INIT_LIST_HEAD(&ns->group_list[idx]); for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++) INIT_LIST_HEAD(&ns->policy_list[idx]); ns->profile_version = 20150505; tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list); list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list); } /** * tomoyo_print_namespace - Print namespace header. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_print_namespace(struct tomoyo_io_buffer *head) { if (!tomoyo_namespace_enabled) return; tomoyo_set_string(head, container_of(head->r.ns, struct tomoyo_policy_namespace, namespace_list)->name); tomoyo_set_space(head); } /** * tomoyo_print_name_union - Print a tomoyo_name_union. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_name_union". */ static void tomoyo_print_name_union(struct tomoyo_io_buffer *head, const struct tomoyo_name_union *ptr) { tomoyo_set_space(head); if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { tomoyo_set_string(head, ptr->filename->name); } } /** * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head, const struct tomoyo_name_union *ptr) { if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { tomoyo_set_string(head, "\""); tomoyo_set_string(head, ptr->filename->name); tomoyo_set_string(head, "\""); } } /** * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ static void tomoyo_print_number_union_nospace (struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) { if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { int i; unsigned long min = ptr->values[0]; const unsigned long max = ptr->values[1]; u8 min_type = ptr->value_type[0]; const u8 max_type = ptr->value_type[1]; char buffer[128]; buffer[0] = '\0'; for (i = 0; i < 2; i++) { switch (min_type) { case TOMOYO_VALUE_TYPE_HEXADECIMAL: tomoyo_addprintf(buffer, sizeof(buffer), "0x%lX", min); break; case TOMOYO_VALUE_TYPE_OCTAL: tomoyo_addprintf(buffer, sizeof(buffer), "0%lo", min); break; default: tomoyo_addprintf(buffer, sizeof(buffer), "%lu", min); break; } if (min == max && min_type == max_type) break; tomoyo_addprintf(buffer, sizeof(buffer), "-"); min_type = max_type; min = max; } tomoyo_io_printf(head, "%s", buffer); } } /** * tomoyo_print_number_union - Print a tomoyo_number_union. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ static void tomoyo_print_number_union(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) { tomoyo_set_space(head); tomoyo_print_number_union_nospace(head, ptr); } /** * tomoyo_assign_profile - Create a new profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to create. * * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise. */ static struct tomoyo_profile *tomoyo_assign_profile (struct tomoyo_policy_namespace *ns, const unsigned int profile) { struct tomoyo_profile *ptr; struct tomoyo_profile *entry; if (profile >= TOMOYO_MAX_PROFILES) return NULL; ptr = ns->profile_ptr[profile]; if (ptr) return ptr; entry = kzalloc(sizeof(*entry), GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = ns->profile_ptr[profile]; if (!ptr && tomoyo_memory_ok(entry)) { ptr = entry; ptr->default_config = TOMOYO_CONFIG_DISABLED | TOMOYO_CONFIG_WANT_GRANT_LOG | TOMOYO_CONFIG_WANT_REJECT_LOG; memset(ptr->config, TOMOYO_CONFIG_USE_DEFAULT, sizeof(ptr->config)); ptr->pref[TOMOYO_PREF_MAX_AUDIT_LOG] = CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG; ptr->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] = CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY; mb(); /* Avoid out-of-order execution. */ ns->profile_ptr[profile] = ptr; entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_profile - Find a profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to find. * * Returns pointer to "struct tomoyo_profile". */ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile) { static struct tomoyo_profile tomoyo_null_profile; struct tomoyo_profile *ptr = ns->profile_ptr[profile]; if (!ptr) ptr = &tomoyo_null_profile; return ptr; } /** * tomoyo_find_yesno - Find values for specified keyword. * * @string: String to check. * @find: Name of keyword. * * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise. */ static s8 tomoyo_find_yesno(const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) { cp += strlen(find); if (!strncmp(cp, "=yes", 4)) return 1; else if (!strncmp(cp, "=no", 3)) return 0; } return -1; } /** * tomoyo_set_uint - Set value for specified preference. * * @i: Pointer to "unsigned int". * @string: String to check. * @find: Name of keyword. * * Returns nothing. */ static void tomoyo_set_uint(unsigned int *i, const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) sscanf(cp + strlen(find), "=%u", i); } /** * tomoyo_set_mode - Set mode for specified profile. * * @name: Name of functionality. * @value: Mode for @name. * @profile: Pointer to "struct tomoyo_profile". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_set_mode(char *name, const char *value, struct tomoyo_profile *profile) { u8 i; u8 config; if (!strcmp(name, "CONFIG")) { i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; config = profile->default_config; } else if (tomoyo_str_starts(&name, "CONFIG::")) { config = 0; for (i = 0; i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) { int len = 0; if (i < TOMOYO_MAX_MAC_INDEX) { const u8 c = tomoyo_index2category[i]; const char *category = tomoyo_category_keywords[c]; len = strlen(category); if (strncmp(name, category, len) || name[len++] != ':' || name[len++] != ':') continue; } if (strcmp(name + len, tomoyo_mac_keywords[i])) continue; config = profile->config[i]; break; } if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) return -EINVAL; } else { return -EINVAL; } if (strstr(value, "use_default")) { config = TOMOYO_CONFIG_USE_DEFAULT; } else { u8 mode; for (mode = 0; mode < 4; mode++) if (strstr(value, tomoyo_mode[mode])) /* * Update lower 3 bits in order to distinguish * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'. */ config = (config & ~7) | mode; if (config != TOMOYO_CONFIG_USE_DEFAULT) { switch (tomoyo_find_yesno(value, "grant_log")) { case 1: config |= TOMOYO_CONFIG_WANT_GRANT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG; break; } switch (tomoyo_find_yesno(value, "reject_log")) { case 1: config |= TOMOYO_CONFIG_WANT_REJECT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG; break; } } } if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) profile->config[i] = config; else if (config != TOMOYO_CONFIG_USE_DEFAULT) profile->default_config = config; return 0; } /** * tomoyo_write_profile - Write profile table. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_write_profile(struct tomoyo_io_buffer *head) { char *data = head->write_buf; unsigned int i; char *cp; struct tomoyo_profile *profile; if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version) == 1) return 0; i = simple_strtoul(data, &cp, 10); if (*cp != '-') return -EINVAL; data = cp + 1; profile = tomoyo_assign_profile(head->w.ns, i); if (!profile) return -EINVAL; cp = strchr(data, '='); if (!cp) return -EINVAL; *cp++ = '\0'; if (!strcmp(data, "COMMENT")) { static DEFINE_SPINLOCK(lock); const struct tomoyo_path_info *new_comment = tomoyo_get_name(cp); const struct tomoyo_path_info *old_comment; if (!new_comment) return -ENOMEM; spin_lock(&lock); old_comment = profile->comment; profile->comment = new_comment; spin_unlock(&lock); tomoyo_put_name(old_comment); return 0; } if (!strcmp(data, "PREFERENCE")) { for (i = 0; i < TOMOYO_MAX_PREF; i++) tomoyo_set_uint(&profile->pref[i], cp, tomoyo_pref_keywords[i]); return 0; } return tomoyo_set_mode(data, cp, profile); } /** * tomoyo_print_config - Print mode for specified functionality. * * @head: Pointer to "struct tomoyo_io_buffer". * @config: Mode for that functionality. * * Returns nothing. * * Caller prints functionality's name. */ static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config) { tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n", tomoyo_mode[config & 3], str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG), str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG)); } /** * tomoyo_read_profile - Read profile table. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_read_profile(struct tomoyo_io_buffer *head) { u8 index; struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); const struct tomoyo_profile *profile; if (head->r.eof) return; next: index = head->r.index; profile = ns->profile_ptr[index]; switch (head->r.step) { case 0: tomoyo_print_namespace(head); tomoyo_io_printf(head, "PROFILE_VERSION=%u\n", ns->profile_version); head->r.step++; break; case 1: for ( ; head->r.index < TOMOYO_MAX_PROFILES; head->r.index++) if (ns->profile_ptr[head->r.index]) break; if (head->r.index == TOMOYO_MAX_PROFILES) { head->r.eof = true; return; } head->r.step++; break; case 2: { u8 i; const struct tomoyo_path_info *comment = profile->comment; tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-COMMENT=", index); tomoyo_set_string(head, comment ? comment->name : ""); tomoyo_set_lf(head); tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-PREFERENCE={ ", index); for (i = 0; i < TOMOYO_MAX_PREF; i++) tomoyo_io_printf(head, "%s=%u ", tomoyo_pref_keywords[i], profile->pref[i]); tomoyo_set_string(head, "}\n"); head->r.step++; } break; case 3: { tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-%s", index, "CONFIG"); tomoyo_print_config(head, profile->default_config); head->r.bit = 0; head->r.step++; } break; case 4: for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) { const u8 i = head->r.bit; const u8 config = profile->config[i]; if (config == TOMOYO_CONFIG_USE_DEFAULT) continue; tomoyo_print_namespace(head); if (i < TOMOYO_MAX_MAC_INDEX) tomoyo_io_printf(head, "%u-CONFIG::%s::%s", index, tomoyo_category_keywords [tomoyo_index2category[i]], tomoyo_mac_keywords[i]); else tomoyo_io_printf(head, "%u-CONFIG::%s", index, tomoyo_mac_keywords[i]); tomoyo_print_config(head, config); head->r.bit++; break; } if (head->r.bit == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) { head->r.index++; head->r.step = 1; } break; } if (tomoyo_flush(head)) goto next; } /** * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_manager(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { return container_of(a, struct tomoyo_manager, head)->manager == container_of(b, struct tomoyo_manager, head)->manager; } /** * tomoyo_update_manager_entry - Add a manager entry. * * @manager: The path to manager or the domainnamme. * @is_delete: True if it is a delete request. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_manager_entry(const char *manager, const bool is_delete) { struct tomoyo_manager e = { }; struct tomoyo_acl_param param = { /* .ns = &tomoyo_kernel_namespace, */ .is_delete = is_delete, .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], }; int error = is_delete ? -ENOENT : -ENOMEM; if (!tomoyo_correct_domain(manager) && !tomoyo_correct_word(manager)) return -EINVAL; e.manager = tomoyo_get_name(manager); if (e.manager) { error = tomoyo_update_policy(&e.head, sizeof(e), &param, tomoyo_same_manager); tomoyo_put_name(e.manager); } return error; } /** * tomoyo_write_manager - Write manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_manager(struct tomoyo_io_buffer *head) { char *data = head->write_buf; if (!strcmp(data, "manage_by_non_root")) { tomoyo_manage_by_non_root = !head->w.is_delete; return 0; } return tomoyo_update_manager_entry(data, head->w.is_delete); } /** * tomoyo_read_manager - Read manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_manager(struct tomoyo_io_buffer *head) { if (head->r.eof) return; list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) { struct tomoyo_manager *ptr = list_entry(head->r.acl, typeof(*ptr), head.list); if (ptr->head.is_deleted) continue; if (!tomoyo_flush(head)) return; tomoyo_set_string(head, ptr->manager->name); tomoyo_set_lf(head); } head->r.eof = true; } /** * tomoyo_manager - Check whether the current process is a policy manager. * * Returns true if the current process is permitted to modify policy * via /sys/kernel/security/tomoyo/ interface. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_manager(void) { struct tomoyo_manager *ptr; const char *exe; const struct task_struct *task = current; const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname; bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING); if (!tomoyo_policy_loaded) return true; if (!tomoyo_manage_by_non_root && (!uid_eq(task->cred->uid, GLOBAL_ROOT_UID) || !uid_eq(task->cred->euid, GLOBAL_ROOT_UID))) return false; exe = tomoyo_get_exe(); if (!exe) return false; list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list, srcu_read_lock_held(&tomoyo_ss)) { if (!ptr->head.is_deleted && (!tomoyo_pathcmp(domainname, ptr->manager) || !strcmp(exe, ptr->manager->name))) { found = true; break; } } if (!found) { /* Reduce error messages. */ static pid_t last_pid; const pid_t pid = current->pid; if (last_pid != pid) { pr_warn("%s ( %s ) is not permitted to update policies.\n", domainname->name, exe); last_pid = pid; } } kfree(exe); return found; } static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial); /** * tomoyo_select_domain - Parse select command. * * @head: Pointer to "struct tomoyo_io_buffer". * @data: String to parse. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head, const char *data) { unsigned int pid; struct tomoyo_domain_info *domain = NULL; bool global_pid = false; if (strncmp(data, "select ", 7)) return false; data += 7; if (sscanf(data, "pid=%u", &pid) == 1 || (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) { struct task_struct *p; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); } else if (!strncmp(data, "domain=", 7)) { if (tomoyo_domain_def(data + 7)) domain = tomoyo_find_domain(data + 7); } else if (sscanf(data, "Q=%u", &pid) == 1) { domain = tomoyo_find_domain_by_qid(pid); } else return false; head->w.domain = domain; /* Accessing read_buf is safe because head->io_sem is held. */ if (!head->read_buf) return true; /* Do nothing if open(O_WRONLY). */ memset(&head->r, 0, sizeof(head->r)); head->r.print_this_domain_only = true; if (domain) head->r.domain = &domain->list; else head->r.eof = true; tomoyo_io_printf(head, "# select %s\n", data); if (domain && domain->is_deleted) tomoyo_io_printf(head, "# This is a deleted domain.\n"); return true; } /** * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head); return p1->domainname == p2->domainname; } /** * tomoyo_write_task - Update task related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_task(struct tomoyo_acl_param *param) { int error = -EINVAL; if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) { struct tomoyo_task_acl e = { .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL, .domainname = tomoyo_get_domainname(param), }; if (e.domainname) error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_task_acl, NULL); tomoyo_put_name(e.domainname); } return error; } /** * tomoyo_delete_domain - Delete a domain. * * @domainname: The name of domain. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_delete_domain(char *domainname) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -EINTR; /* Is there an active domain? */ list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; if (domain->is_deleted || tomoyo_pathcmp(domain->domainname, &name)) continue; domain->is_deleted = true; break; } mutex_unlock(&tomoyo_policy_lock); return 0; } /** * tomoyo_write_domain2 - Write domain policy. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @list: Pointer to "struct list_head". * @data: Policy to be interpreted. * @is_delete: True if it is a delete request. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns, struct list_head *list, char *data, const bool is_delete) { struct tomoyo_acl_param param = { .ns = ns, .list = list, .data = data, .is_delete = is_delete, }; static const struct { const char *keyword; int (*write)(struct tomoyo_acl_param *param); } tomoyo_callback[5] = { { "file ", tomoyo_write_file }, { "network inet ", tomoyo_write_inet_network }, { "network unix ", tomoyo_write_unix_network }, { "misc ", tomoyo_write_misc }, { "task ", tomoyo_write_task }, }; u8 i; for (i = 0; i < ARRAY_SIZE(tomoyo_callback); i++) { if (!tomoyo_str_starts(&param.data, tomoyo_callback[i].keyword)) continue; return tomoyo_callback[i].write(&param); } return -EINVAL; } /* String table for domain flags. */ const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = { [TOMOYO_DIF_QUOTA_WARNED] = "quota_exceeded\n", [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n", }; /** * tomoyo_write_domain - Write domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_domain(struct tomoyo_io_buffer *head) { char *data = head->write_buf; struct tomoyo_policy_namespace *ns; struct tomoyo_domain_info *domain = head->w.domain; const bool is_delete = head->w.is_delete; bool is_select = !is_delete && tomoyo_str_starts(&data, "select "); unsigned int idx; if (*data == '<') { int ret = 0; domain = NULL; if (is_delete) ret = tomoyo_delete_domain(data); else if (is_select) domain = tomoyo_find_domain(data); else domain = tomoyo_assign_domain(data, false); head->w.domain = domain; return ret; } if (!domain) return -EINVAL; ns = domain->ns; if (sscanf(data, "use_profile %u", &idx) == 1 && idx < TOMOYO_MAX_PROFILES) { if (!tomoyo_policy_loaded || ns->profile_ptr[idx]) if (!is_delete) domain->profile = (u8) idx; return 0; } if (sscanf(data, "use_group %u\n", &idx) == 1 && idx < TOMOYO_MAX_ACL_GROUPS) { if (!is_delete) set_bit(idx, domain->group); else clear_bit(idx, domain->group); return 0; } for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) { const char *cp = tomoyo_dif[idx]; if (strncmp(data, cp, strlen(cp) - 1)) continue; domain->flags[idx] = !is_delete; return 0; } return tomoyo_write_domain2(ns, &domain->acl_info_list, data, is_delete); } /** * tomoyo_print_condition - Print condition part. * * @head: Pointer to "struct tomoyo_io_buffer". * @cond: Pointer to "struct tomoyo_condition". * * Returns true on success, false otherwise. */ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head, const struct tomoyo_condition *cond) { switch (head->r.cond_step) { case 0: head->r.cond_index = 0; head->r.cond_step++; if (cond->transit) { tomoyo_set_space(head); tomoyo_set_string(head, cond->transit->name); } fallthrough; case 1: { const u16 condc = cond->condc; const struct tomoyo_condition_element *condp = (typeof(condp)) (cond + 1); const struct tomoyo_number_union *numbers_p = (typeof(numbers_p)) (condp + condc); const struct tomoyo_name_union *names_p = (typeof(names_p)) (numbers_p + cond->numbers_count); const struct tomoyo_argv *argv = (typeof(argv)) (names_p + cond->names_count); const struct tomoyo_envp *envp = (typeof(envp)) (argv + cond->argc); u16 skip; for (skip = 0; skip < head->r.cond_index; skip++) { const u8 left = condp->left; const u8 right = condp->right; condp++; switch (left) { case TOMOYO_ARGV_ENTRY: argv++; continue; case TOMOYO_ENVP_ENTRY: envp++; continue; case TOMOYO_NUMBER_UNION: numbers_p++; break; } switch (right) { case TOMOYO_NAME_UNION: names_p++; break; case TOMOYO_NUMBER_UNION: numbers_p++; break; } } while (head->r.cond_index < condc) { const u8 match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; if (!tomoyo_flush(head)) return false; condp++; head->r.cond_index++; tomoyo_set_space(head); switch (left) { case TOMOYO_ARGV_ENTRY: tomoyo_io_printf(head, "exec.argv[%lu]%s=\"", argv->index, argv->is_not ? "!" : ""); tomoyo_set_string(head, argv->value->name); tomoyo_set_string(head, "\""); argv++; continue; case TOMOYO_ENVP_ENTRY: tomoyo_set_string(head, "exec.envp[\""); tomoyo_set_string(head, envp->name->name); tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : ""); if (envp->value) { tomoyo_set_string(head, "\""); tomoyo_set_string(head, envp->value->name); tomoyo_set_string(head, "\""); } else { tomoyo_set_string(head, "NULL"); } envp++; continue; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[left]); break; } tomoyo_set_string(head, match ? "=" : "!="); switch (right) { case TOMOYO_NAME_UNION: tomoyo_print_name_union_quoted (head, names_p++); break; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[right]); break; } } } head->r.cond_step++; fallthrough; case 2: if (!tomoyo_flush(head)) break; head->r.cond_step++; fallthrough; case 3: if (cond->grant_log != TOMOYO_GRANTLOG_AUTO) tomoyo_io_printf(head, " grant_log=%s", str_yes_no(cond->grant_log == TOMOYO_GRANTLOG_YES)); tomoyo_set_lf(head); return true; } return false; } /** * tomoyo_set_group - Print "acl_group " header keyword and category name. * * @head: Pointer to "struct tomoyo_io_buffer". * @category: Category name. * * Returns nothing. */ static void tomoyo_set_group(struct tomoyo_io_buffer *head, const char *category) { if (head->type == TOMOYO_EXCEPTIONPOLICY) { tomoyo_print_namespace(head); tomoyo_io_printf(head, "acl_group %u ", head->r.acl_group_index); } tomoyo_set_string(head, category); } /** * tomoyo_print_entry - Print an ACL entry. * * @head: Pointer to "struct tomoyo_io_buffer". * @acl: Pointer to an ACL entry. * * Returns true on success, false otherwise. */ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head, struct tomoyo_acl_info *acl) { const u8 acl_type = acl->type; bool first = true; u8 bit; if (head->r.print_cond_part) goto print_cond_part; if (acl->is_deleted) return true; if (!tomoyo_flush(head)) return false; else if (acl_type == TOMOYO_TYPE_PATH_ACL) { struct tomoyo_path_acl *ptr = container_of(acl, typeof(*ptr), head); const u16 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (head->r.print_transition_related_only && bit != TOMOYO_TYPE_EXECUTE) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_path_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) { struct tomoyo_task_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "task "); tomoyo_set_string(head, "manual_domain_transition "); tomoyo_set_string(head, ptr->domainname->name); } else if (head->r.print_transition_related_only) { return true; } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) { struct tomoyo_path2_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pp2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name1); tomoyo_print_name_union(head, &ptr->name2); } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) { struct tomoyo_path_number_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->number); } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) { struct tomoyo_mkdev_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pnnn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->mode); tomoyo_print_number_union(head, &ptr->major); tomoyo_print_number_union(head, &ptr->minor); } else if (acl_type == TOMOYO_TYPE_INET_ACL) { struct tomoyo_inet_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network inet "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_set_space(head); if (ptr->address.group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->address.group->group_name ->name); } else { char buf[128]; tomoyo_print_ip(buf, sizeof(buf), &ptr->address); tomoyo_io_printf(head, "%s", buf); } tomoyo_print_number_union(head, &ptr->port); } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) { struct tomoyo_unix_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network unix "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) { struct tomoyo_mount_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "file mount"); tomoyo_print_name_union(head, &ptr->dev_name); tomoyo_print_name_union(head, &ptr->dir_name); tomoyo_print_name_union(head, &ptr->fs_type); tomoyo_print_number_union(head, &ptr->flags); } else if (acl_type == TOMOYO_TYPE_ENV_ACL) { struct tomoyo_env_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "misc env "); tomoyo_set_string(head, ptr->env->name); } if (acl->cond) { head->r.print_cond_part = true; head->r.cond_step = 0; if (!tomoyo_flush(head)) return false; print_cond_part: if (!tomoyo_print_condition(head, acl->cond)) return false; head->r.print_cond_part = false; } else { tomoyo_set_lf(head); } return true; } /** * tomoyo_read_domain2 - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * @list: Pointer to "struct list_head". * * Caller holds tomoyo_read_lock(). * * Returns true on success, false otherwise. */ static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head, struct list_head *list) { list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_info *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (!tomoyo_print_entry(head, ptr)) return false; } head->r.acl = NULL; return true; } /** * tomoyo_read_domain - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_domain(struct tomoyo_io_buffer *head) { if (head->r.eof) return; list_for_each_cookie(head->r.domain, &tomoyo_domain_list) { struct tomoyo_domain_info *domain = list_entry(head->r.domain, typeof(*domain), list); u8 i; switch (head->r.step) { case 0: if (domain->is_deleted && !head->r.print_this_domain_only) continue; /* Print domainname and flags. */ tomoyo_set_string(head, domain->domainname->name); tomoyo_set_lf(head); tomoyo_io_printf(head, "use_profile %u\n", domain->profile); for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++) if (domain->flags[i]) tomoyo_set_string(head, tomoyo_dif[i]); head->r.index = 0; head->r.step++; fallthrough; case 1: while (head->r.index < TOMOYO_MAX_ACL_GROUPS) { i = head->r.index++; if (!test_bit(i, domain->group)) continue; tomoyo_io_printf(head, "use_group %u\n", i); if (!tomoyo_flush(head)) return; } head->r.index = 0; head->r.step++; tomoyo_set_lf(head); fallthrough; case 2: if (!tomoyo_read_domain2(head, &domain->acl_info_list)) return; head->r.step++; if (!tomoyo_set_lf(head)) return; fallthrough; case 3: head->r.step = 0; if (head->r.print_this_domain_only) goto done; } } done: head->r.eof = true; } /** * tomoyo_write_pid: Specify PID to obtain domainname. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. */ static int tomoyo_write_pid(struct tomoyo_io_buffer *head) { head->r.eof = false; return 0; } /** * tomoyo_read_pid - Get domainname of the specified PID. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns the domainname which the specified PID is in on success, * empty string otherwise. * The PID is specified by tomoyo_write_pid() so that the user can obtain * using read()/write() interface rather than sysctl() interface. */ static void tomoyo_read_pid(struct tomoyo_io_buffer *head) { char *buf = head->write_buf; bool global_pid = false; unsigned int pid; struct task_struct *p; struct tomoyo_domain_info *domain = NULL; /* Accessing write_buf is safe because head->io_sem is held. */ if (!buf) { head->r.eof = true; return; /* Do nothing if open(O_RDONLY). */ } if (head->r.w_pos || head->r.eof) return; head->r.eof = true; if (tomoyo_str_starts(&buf, "global-pid ")) global_pid = true; if (kstrtouint(buf, 10, &pid)) return; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); if (!domain) return; tomoyo_io_printf(head, "%u %u ", pid, domain->profile); tomoyo_set_string(head, domain->domainname->name); } /* String table for domain transition control keywords. */ static const char *tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = { [TOMOYO_TRANSITION_CONTROL_NO_RESET] = "no_reset_domain ", [TOMOYO_TRANSITION_CONTROL_RESET] = "reset_domain ", [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ", [TOMOYO_TRANSITION_CONTROL_INITIALIZE] = "initialize_domain ", [TOMOYO_TRANSITION_CONTROL_NO_KEEP] = "no_keep_domain ", [TOMOYO_TRANSITION_CONTROL_KEEP] = "keep_domain ", }; /* String table for grouping keywords. */ static const char *tomoyo_group_name[TOMOYO_MAX_GROUP] = { [TOMOYO_PATH_GROUP] = "path_group ", [TOMOYO_NUMBER_GROUP] = "number_group ", [TOMOYO_ADDRESS_GROUP] = "address_group ", }; /** * tomoyo_write_exception - Write exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_exception(struct tomoyo_io_buffer *head) { const bool is_delete = head->w.is_delete; struct tomoyo_acl_param param = { .ns = head->w.ns, .is_delete = is_delete, .data = head->write_buf, }; u8 i; if (tomoyo_str_starts(&param.data, "aggregator ")) return tomoyo_write_aggregator(&param); for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++) if (tomoyo_str_starts(&param.data, tomoyo_transition_type[i])) return tomoyo_write_transition_control(&param, i); for (i = 0; i < TOMOYO_MAX_GROUP; i++) if (tomoyo_str_starts(&param.data, tomoyo_group_name[i])) return tomoyo_write_group(&param, i); if (tomoyo_str_starts(&param.data, "acl_group ")) { unsigned int group; char *data; group = simple_strtoul(param.data, &data, 10); if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ') return tomoyo_write_domain2 (head->w.ns, &head->w.ns->acl_group[group], data, is_delete); } return -EINVAL; } /** * tomoyo_read_group - Read "struct tomoyo_path_group"/"struct tomoyo_number_group"/"struct tomoyo_address_group" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->group_list[idx]; list_for_each_cookie(head->r.group, list) { struct tomoyo_group *group = list_entry(head->r.group, typeof(*group), head.list); list_for_each_cookie(head->r.acl, &group->member_list) { struct tomoyo_acl_head *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (ptr->is_deleted) continue; if (!tomoyo_flush(head)) return false; tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_group_name[idx]); tomoyo_set_string(head, group->group_name->name); if (idx == TOMOYO_PATH_GROUP) { tomoyo_set_space(head); tomoyo_set_string(head, container_of (ptr, struct tomoyo_path_group, head)->member_name->name); } else if (idx == TOMOYO_NUMBER_GROUP) { tomoyo_print_number_union(head, &container_of (ptr, struct tomoyo_number_group, head)->number); } else if (idx == TOMOYO_ADDRESS_GROUP) { char buffer[128]; struct tomoyo_address_group *member = container_of(ptr, typeof(*member), head); tomoyo_print_ip(buffer, sizeof(buffer), &member->address); tomoyo_io_printf(head, " %s", buffer); } tomoyo_set_lf(head); } head->r.acl = NULL; } head->r.group = NULL; return true; } /** * tomoyo_read_policy - Read "struct tomoyo_..._entry" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->policy_list[idx]; list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_head *acl = container_of(head->r.acl, typeof(*acl), list); if (acl->is_deleted) continue; if (!tomoyo_flush(head)) return false; switch (idx) { case TOMOYO_ID_TRANSITION_CONTROL: { struct tomoyo_transition_control *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_transition_type [ptr->type]); tomoyo_set_string(head, ptr->program ? ptr->program->name : "any"); tomoyo_set_string(head, " from "); tomoyo_set_string(head, ptr->domainname ? ptr->domainname->name : "any"); } break; case TOMOYO_ID_AGGREGATOR: { struct tomoyo_aggregator *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, "aggregator "); tomoyo_set_string(head, ptr->original_name->name); tomoyo_set_space(head); tomoyo_set_string(head, ptr->aggregated_name->name); } break; default: continue; } tomoyo_set_lf(head); } head->r.acl = NULL; return true; } /** * tomoyo_read_exception - Read exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_exception(struct tomoyo_io_buffer *head) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); if (head->r.eof) return; while (head->r.step < TOMOYO_MAX_POLICY && tomoyo_read_policy(head, head->r.step)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP && tomoyo_read_group(head, head->r.step - TOMOYO_MAX_POLICY)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP + TOMOYO_MAX_ACL_GROUPS) { head->r.acl_group_index = head->r.step - TOMOYO_MAX_POLICY - TOMOYO_MAX_GROUP; if (!tomoyo_read_domain2(head, &ns->acl_group [head->r.acl_group_index])) return; head->r.step++; } head->r.eof = true; } /* Wait queue for kernel -> userspace notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait); /* Wait queue for userspace -> kernel notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait); /* Structure for query. */ struct tomoyo_query { struct list_head list; struct tomoyo_domain_info *domain; char *query; size_t query_len; unsigned int serial; u8 timer; u8 answer; u8 retry; }; /* The list for "struct tomoyo_query". */ static LIST_HEAD(tomoyo_query_list); /* Lock for manipulating tomoyo_query_list. */ static DEFINE_SPINLOCK(tomoyo_query_list_lock); /* * Number of "struct file" referring /sys/kernel/security/tomoyo/query * interface. */ static atomic_t tomoyo_query_observers = ATOMIC_INIT(0); /** * tomoyo_truncate - Truncate a line. * * @str: String to truncate. * * Returns length of truncated @str. */ static int tomoyo_truncate(char *str) { char *start = str; while (*(unsigned char *) str > (unsigned char) ' ') str++; *str = '\0'; return strlen(start) + 1; } /** * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode. * * @domain: Pointer to "struct tomoyo_domain_info". * @header: Lines containing ACL. * * Returns nothing. */ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) { char *buffer; char *realpath = NULL; char *argv0 = NULL; char *symlink = NULL; char *cp = strchr(header, '\n'); int len; if (!cp) return; cp = strchr(cp + 1, '\n'); if (!cp) return; *cp++ = '\0'; len = strlen(cp) + 1; /* strstr() will return NULL if ordering is wrong. */ if (*cp == 'f') { argv0 = strstr(header, " argv[]={ \""); if (argv0) { argv0 += 10; len += tomoyo_truncate(argv0) + 14; } realpath = strstr(header, " exec={ realpath=\""); if (realpath) { realpath += 8; len += tomoyo_truncate(realpath) + 6; } symlink = strstr(header, " symlink.target=\""); if (symlink) len += tomoyo_truncate(symlink + 1) + 1; } buffer = kmalloc(len, GFP_NOFS); if (!buffer) return; snprintf(buffer, len - 1, "%s", cp); if (realpath) tomoyo_addprintf(buffer, len, " exec.%s", realpath); if (argv0) tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0); if (symlink) tomoyo_addprintf(buffer, len, "%s", symlink); tomoyo_normalize_line(buffer); if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer, false)) tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); kfree(buffer); } /** * tomoyo_supervisor - Ask for the supervisor's decision. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns 0 if the supervisor decided to permit the access request which * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the * supervisor decided to retry the access request which violated the policy in * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise. */ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int error; int len; static unsigned int tomoyo_serial; struct tomoyo_query entry = { }; bool quota_exceeded = false; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); /* Write /sys/kernel/security/tomoyo/audit. */ va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); /* Nothing more to do if granted. */ if (r->granted) return 0; if (r->mode) tomoyo_update_stat(r->mode); switch (r->mode) { case TOMOYO_CONFIG_ENFORCING: error = -EPERM; if (atomic_read(&tomoyo_query_observers)) break; goto out; case TOMOYO_CONFIG_LEARNING: error = 0; /* Check max_learning_entry parameter. */ if (tomoyo_domain_quota_is_ok(r)) break; fallthrough; default: return 0; } /* Get message. */ va_start(args, fmt); entry.query = tomoyo_init_log(r, len, fmt, args); va_end(args); if (!entry.query) goto out; entry.query_len = strlen(entry.query) + 1; if (!error) { tomoyo_add_entry(r->domain, entry.query); goto out; } len = kmalloc_size_roundup(entry.query_len); entry.domain = r->domain; spin_lock(&tomoyo_query_list_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] && tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) { quota_exceeded = true; } else { entry.serial = tomoyo_serial++; entry.retry = r->retry; tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len; list_add_tail(&entry.list, &tomoyo_query_list); } spin_unlock(&tomoyo_query_list_lock); if (quota_exceeded) goto out; /* Give 10 seconds for supervisor's opinion. */ while (entry.timer < 10) { wake_up_all(&tomoyo_query_wait); if (wait_event_interruptible_timeout (tomoyo_answer_wait, entry.answer || !atomic_read(&tomoyo_query_observers), HZ)) break; entry.timer++; } spin_lock(&tomoyo_query_list_lock); list_del(&entry.list); tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len; spin_unlock(&tomoyo_query_list_lock); switch (entry.answer) { case 3: /* Asked to retry by administrator. */ error = TOMOYO_RETRY_REQUEST; r->retry++; break; case 1: /* Granted by administrator. */ error = 0; break; default: /* Timed out or rejected by administrator. */ break; } out: kfree(entry.query); return error; } /** * tomoyo_find_domain_by_qid - Get domain by query id. * * @serial: Query ID assigned by tomoyo_supervisor(). * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. */ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial) { struct tomoyo_query *ptr; struct tomoyo_domain_info *domain = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each_entry(ptr, &tomoyo_query_list, list) { if (ptr->serial != serial) continue; domain = ptr->domain; break; } spin_unlock(&tomoyo_query_list_lock); return domain; } /** * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". * * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise. * * Waits for access requests which violated policy in enforcing mode. */ static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait) { if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_query_wait, wait); if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; return 0; } /** * tomoyo_read_query - Read access requests which violated policy in enforcing mode. * * @head: Pointer to "struct tomoyo_io_buffer". */ static void tomoyo_read_query(struct tomoyo_io_buffer *head) { struct list_head *tmp; unsigned int pos = 0; size_t len = 0; char *buf; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; len = ptr->query_len; break; } spin_unlock(&tomoyo_query_list_lock); if (!len) { head->r.query_index = 0; return; } buf = kzalloc(len + 32, GFP_NOFS); if (!buf) return; pos = 0; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; /* * Some query can be skipped because tomoyo_query_list * can change, but I don't care. */ if (len == ptr->query_len) snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial, ptr->retry, ptr->query); break; } spin_unlock(&tomoyo_query_list_lock); if (buf[0]) { head->read_buf = buf; head->r.w[head->r.w_pos++] = buf; head->r.query_index++; } else { kfree(buf); } } /** * tomoyo_write_answer - Write the supervisor's decision. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, -EINVAL otherwise. */ static int tomoyo_write_answer(struct tomoyo_io_buffer *head) { char *data = head->write_buf; struct list_head *tmp; unsigned int serial; unsigned int answer; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); ptr->timer = 0; } spin_unlock(&tomoyo_query_list_lock); if (sscanf(data, "A%u=%u", &serial, &answer) != 2) return -EINVAL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (ptr->serial != serial) continue; ptr->answer = answer; /* Remove from tomoyo_query_list. */ if (ptr->answer) list_del_init(&ptr->list); break; } spin_unlock(&tomoyo_query_list_lock); return 0; } /** * tomoyo_read_version: Get version. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns version information. */ static void tomoyo_read_version(struct tomoyo_io_buffer *head) { if (!head->r.eof) { tomoyo_io_printf(head, "2.6.0"); head->r.eof = true; } } /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = { [TOMOYO_STAT_POLICY_UPDATES] = "update:", [TOMOYO_STAT_POLICY_LEARNING] = "violation in learning mode:", [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:", [TOMOYO_STAT_POLICY_ENFORCING] = "violation in enforcing mode:", }; /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = { [TOMOYO_MEMORY_POLICY] = "policy:", [TOMOYO_MEMORY_AUDIT] = "audit log:", [TOMOYO_MEMORY_QUERY] = "query message:", }; /* Counter for number of updates. */ static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT]; /* Timestamp counter for last updated. */ static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT]; /** * tomoyo_update_stat - Update statistic counters. * * @index: Index for policy type. * * Returns nothing. */ void tomoyo_update_stat(const u8 index) { atomic_inc(&tomoyo_stat_updated[index]); tomoyo_stat_modified[index] = ktime_get_real_seconds(); } /** * tomoyo_read_stat - Read statistic data. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_read_stat(struct tomoyo_io_buffer *head) { u8 i; unsigned int total = 0; if (head->r.eof) return; for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) { tomoyo_io_printf(head, "Policy %-30s %10u", tomoyo_policy_headers[i], atomic_read(&tomoyo_stat_updated[i])); if (tomoyo_stat_modified[i]) { struct tomoyo_time stamp; tomoyo_convert_time(tomoyo_stat_modified[i], &stamp); tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec); } tomoyo_set_lf(head); } for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) { unsigned int used = tomoyo_memory_used[i]; total += used; tomoyo_io_printf(head, "Memory used by %-22s %10u", tomoyo_memory_headers[i], used); used = tomoyo_memory_quota[i]; if (used) tomoyo_io_printf(head, " (Quota: %10u)", used); tomoyo_set_lf(head); } tomoyo_io_printf(head, "Total memory used: %10u\n", total); head->r.eof = true; } /** * tomoyo_write_stat - Set memory quota. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. */ static int tomoyo_write_stat(struct tomoyo_io_buffer *head) { char *data = head->write_buf; u8 i; if (tomoyo_str_starts(&data, "Memory used by ")) for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) if (tomoyo_str_starts(&data, tomoyo_memory_headers[i])) sscanf(data, "%u", &tomoyo_memory_quota[i]); return 0; } /** * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface. * * @type: Type of interface. * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ int tomoyo_open_control(const u8 type, struct file *file) { struct tomoyo_io_buffer *head = kzalloc(sizeof(*head), GFP_NOFS); if (!head) return -ENOMEM; mutex_init(&head->io_sem); head->type = type; switch (type) { case TOMOYO_DOMAINPOLICY: /* /sys/kernel/security/tomoyo/domain_policy */ head->write = tomoyo_write_domain; head->read = tomoyo_read_domain; break; case TOMOYO_EXCEPTIONPOLICY: /* /sys/kernel/security/tomoyo/exception_policy */ head->write = tomoyo_write_exception; head->read = tomoyo_read_exception; break; case TOMOYO_AUDIT: /* /sys/kernel/security/tomoyo/audit */ head->poll = tomoyo_poll_log; head->read = tomoyo_read_log; break; case TOMOYO_PROCESS_STATUS: /* /sys/kernel/security/tomoyo/.process_status */ head->write = tomoyo_write_pid; head->read = tomoyo_read_pid; break; case TOMOYO_VERSION: /* /sys/kernel/security/tomoyo/version */ head->read = tomoyo_read_version; head->readbuf_size = 128; break; case TOMOYO_STAT: /* /sys/kernel/security/tomoyo/stat */ head->write = tomoyo_write_stat; head->read = tomoyo_read_stat; head->readbuf_size = 1024; break; case TOMOYO_PROFILE: /* /sys/kernel/security/tomoyo/profile */ head->write = tomoyo_write_profile; head->read = tomoyo_read_profile; break; case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */ head->poll = tomoyo_poll_query; head->write = tomoyo_write_answer; head->read = tomoyo_read_query; break; case TOMOYO_MANAGER: /* /sys/kernel/security/tomoyo/manager */ head->write = tomoyo_write_manager; head->read = tomoyo_read_manager; break; } if (!(file->f_mode & FMODE_READ)) { /* * No need to allocate read_buf since it is not opened * for reading. */ head->read = NULL; head->poll = NULL; } else if (!head->poll) { /* Don't allocate read_buf for poll() access. */ if (!head->readbuf_size) head->readbuf_size = 4096 * 2; head->read_buf = kzalloc(head->readbuf_size, GFP_NOFS); if (!head->read_buf) { kfree(head); return -ENOMEM; } } if (!(file->f_mode & FMODE_WRITE)) { /* * No need to allocate write_buf since it is not opened * for writing. */ head->write = NULL; } else if (head->write) { head->writebuf_size = 4096 * 2; head->write_buf = kzalloc(head->writebuf_size, GFP_NOFS); if (!head->write_buf) { kfree(head->read_buf); kfree(head); return -ENOMEM; } } /* * If the file is /sys/kernel/security/tomoyo/query , increment the * observer counter. * The obserber counter is used by tomoyo_supervisor() to see if * there is some process monitoring /sys/kernel/security/tomoyo/query. */ if (type == TOMOYO_QUERY) atomic_inc(&tomoyo_query_observers); file->private_data = head; tomoyo_notify_gc(head, true); return 0; } /** * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write, * EPOLLOUT | EPOLLWRNORM otherwise. */ __poll_t tomoyo_poll_control(struct file *file, poll_table *wait) { struct tomoyo_io_buffer *head = file->private_data; if (head->poll) return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM; return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM; } /** * tomoyo_set_namespace_cursor - Set namespace to read. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head) { struct list_head *ns; if (head->type != TOMOYO_EXCEPTIONPOLICY && head->type != TOMOYO_PROFILE) return; /* * If this is the first read, or reading previous namespace finished * and has more namespaces to read, update the namespace cursor. */ ns = head->r.ns; if (!ns || (head->r.eof && ns->next != &tomoyo_namespace_list)) { /* Clearing is OK because tomoyo_flush() returned true. */ memset(&head->r, 0, sizeof(head->r)); head->r.ns = ns ? ns->next : tomoyo_namespace_list.next; } } /** * tomoyo_has_more_namespace - Check for unread namespaces. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns true if we have more entries to print, false otherwise. */ static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head) { return (head->type == TOMOYO_EXCEPTIONPOLICY || head->type == TOMOYO_PROFILE) && head->r.eof && head->r.ns->next != &tomoyo_namespace_list; } /** * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". * @buffer: Pointer to buffer to write to. * @buffer_len: Size of @buffer. * * Returns bytes read on success, negative value otherwise. */ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len) { int len; int idx; if (!head->read) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; head->read_user_buf = buffer; head->read_user_buf_avail = buffer_len; idx = tomoyo_read_lock(); if (tomoyo_flush(head)) /* Call the policy handler. */ do { tomoyo_set_namespace_cursor(head); head->read(head); } while (tomoyo_flush(head) && tomoyo_has_more_namespace(head)); tomoyo_read_unlock(idx); len = head->read_user_buf - buffer; mutex_unlock(&head->io_sem); return len; } /** * tomoyo_parse_policy - Parse a policy line. * * @head: Pointer to "struct tomoyo_io_buffer". * @line: Line to parse. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line) { /* Delete request? */ head->w.is_delete = !strncmp(line, "delete ", 7); if (head->w.is_delete) memmove(line, line + 7, strlen(line + 7) + 1); /* Selecting namespace to update. */ if (head->type == TOMOYO_EXCEPTIONPOLICY || head->type == TOMOYO_PROFILE) { if (*line == '<') { char *cp = strchr(line, ' '); if (cp) { *cp++ = '\0'; head->w.ns = tomoyo_assign_namespace(line); memmove(line, cp, strlen(cp) + 1); } else head->w.ns = NULL; } else head->w.ns = &tomoyo_kernel_namespace; /* Don't allow updating if namespace is invalid. */ if (!head->w.ns) return -ENOENT; } /* Do the update. */ return head->write(head); } /** * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". * @buffer: Pointer to buffer to read from. * @buffer_len: Size of @buffer. * * Returns @buffer_len on success, negative value otherwise. */ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len) { int error = buffer_len; size_t avail_len = buffer_len; char *cp0 = head->write_buf; int idx; if (!head->write) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; head->read_user_buf_avail = 0; idx = tomoyo_read_lock(); /* Read a line and dispatch it to the policy handler. */ while (avail_len > 0) { char c; if (head->w.avail >= head->writebuf_size - 1) { const int len = head->writebuf_size * 2; char *cp = kzalloc(len, GFP_NOFS); if (!cp) { error = -ENOMEM; break; } memmove(cp, cp0, head->w.avail); kfree(cp0); head->write_buf = cp; cp0 = cp; head->writebuf_size = len; } if (get_user(c, buffer)) { error = -EFAULT; break; } buffer++; avail_len--; cp0[head->w.avail++] = c; if (c != '\n') continue; cp0[head->w.avail - 1] = '\0'; head->w.avail = 0; tomoyo_normalize_line(cp0); if (!strcmp(cp0, "reset")) { head->w.ns = &tomoyo_kernel_namespace; head->w.domain = NULL; memset(&head->r, 0, sizeof(head->r)); continue; } /* Don't allow updating policies by non manager programs. */ switch (head->type) { case TOMOYO_PROCESS_STATUS: /* This does not write anything. */ break; case TOMOYO_DOMAINPOLICY: if (tomoyo_select_domain(head, cp0)) continue; fallthrough; case TOMOYO_EXCEPTIONPOLICY: if (!strcmp(cp0, "select transition_only")) { head->r.print_transition_related_only = true; continue; } fallthrough; default: if (!tomoyo_manager()) { error = -EPERM; goto out; } } switch (tomoyo_parse_policy(head, cp0)) { case -EPERM: error = -EPERM; goto out; case 0: switch (head->type) { case TOMOYO_DOMAINPOLICY: case TOMOYO_EXCEPTIONPOLICY: case TOMOYO_STAT: case TOMOYO_PROFILE: case TOMOYO_MANAGER: tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); break; default: break; } break; } } out: tomoyo_read_unlock(idx); mutex_unlock(&head->io_sem); return error; } /** * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". */ void tomoyo_close_control(struct tomoyo_io_buffer *head) { /* * If the file is /sys/kernel/security/tomoyo/query , decrement the * observer counter. */ if (head->type == TOMOYO_QUERY && atomic_dec_and_test(&tomoyo_query_observers)) wake_up_all(&tomoyo_answer_wait); tomoyo_notify_gc(head, false); } /** * tomoyo_check_profile - Check all profiles currently assigned to domains are defined. */ void tomoyo_check_profile(void) { struct tomoyo_domain_info *domain; const int idx = tomoyo_read_lock(); tomoyo_policy_loaded = true; pr_info("TOMOYO: 2.6.0\n"); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { const u8 profile = domain->profile; struct tomoyo_policy_namespace *ns = domain->ns; if (ns->profile_version == 20110903) { pr_info_once("Converting profile version from %u to %u.\n", 20110903, 20150505); ns->profile_version = 20150505; } if (ns->profile_version != 20150505) pr_err("Profile version %u is not supported.\n", ns->profile_version); else if (!ns->profile_ptr[profile]) pr_err("Profile %u (used by '%s') is not defined.\n", profile, domain->domainname->name); else continue; pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n"); pr_err("Please see https://tomoyo.osdn.jp/2.6/ for more information.\n"); panic("STOP!"); } tomoyo_read_unlock(idx); pr_info("Mandatory Access Control activated.\n"); } /** * tomoyo_load_builtin_policy - Load built-in policy. * * Returns nothing. */ void __init tomoyo_load_builtin_policy(void) { #ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING static char tomoyo_builtin_profile[] __initdata = "PROFILE_VERSION=20150505\n" "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n"; static char tomoyo_builtin_exception_policy[] __initdata = "aggregator proc:/self/exe /proc/self/exe\n"; static char tomoyo_builtin_domain_policy[] __initdata = ""; static char tomoyo_builtin_manager[] __initdata = ""; static char tomoyo_builtin_stat[] __initdata = ""; #else /* * This include file is manually created and contains built-in policy * named "tomoyo_builtin_profile", "tomoyo_builtin_exception_policy", * "tomoyo_builtin_domain_policy", "tomoyo_builtin_manager", * "tomoyo_builtin_stat" in the form of "static char [] __initdata". */ #include "builtin-policy.h" #endif u8 i; const int idx = tomoyo_read_lock(); for (i = 0; i < 5; i++) { struct tomoyo_io_buffer head = { }; char *start = ""; switch (i) { case 0: start = tomoyo_builtin_profile; head.type = TOMOYO_PROFILE; head.write = tomoyo_write_profile; break; case 1: start = tomoyo_builtin_exception_policy; head.type = TOMOYO_EXCEPTIONPOLICY; head.write = tomoyo_write_exception; break; case 2: start = tomoyo_builtin_domain_policy; head.type = TOMOYO_DOMAINPOLICY; head.write = tomoyo_write_domain; break; case 3: start = tomoyo_builtin_manager; head.type = TOMOYO_MANAGER; head.write = tomoyo_write_manager; break; case 4: start = tomoyo_builtin_stat; head.type = TOMOYO_STAT; head.write = tomoyo_write_stat; break; } while (1) { char *end = strchr(start, '\n'); if (!end) break; *end = '\0'; tomoyo_normalize_line(start); head.write_buf = start; tomoyo_parse_policy(&head, start); start = end + 1; } } tomoyo_read_unlock(idx); #ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER tomoyo_check_profile(); #endif }
1519 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H #define __HSR_SLAVE_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include "hsr_main.h" int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type pt, struct netlink_ext_ack *extack); void hsr_del_port(struct hsr_port *port); bool hsr_port_exists(const struct net_device *dev); static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) { ASSERT_RTNL(); return hsr_port_exists(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) { return hsr_port_exists(dev) ? rcu_dereference(dev->rx_handler_data) : NULL; } bool hsr_invalid_dan_ingress_frame(__be16 protocol); #endif /* __HSR_SLAVE_H */
10 10 10 6 6 6 5 5 1 8 6 6 4 4 6 4 4 4 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 // SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int *debug; static struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size = sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? : copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int len) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, level, optname, len); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: if (copy_from_sockptr(&opt, optval, sizeof(int))) { err = -EFAULT; break; } if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); }
401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
4866 4866 4751 4751 4751 4751 5758 5759 5760 5760 5760 4523 2383 5759 5759 5760 5757 5758 5757 5758 11 11 11 11 394 83 5758 5758 5758 5756 221 220 221 221 1105 1107 1105 5757 5758 5758 5757 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 // SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ #include "ext4_jbd2.h" #include <trace/events/ext4.h> int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ BUG(); } /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { handle_t *handle = current->journal_info; unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); ref_cnt++; handle = (handle_t *)ref_cnt; current->journal_info = handle; return handle; } /* Decrement the non-pointer handle value */ static void ext4_put_nojournal(handle_t *handle) { unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt == 0); ref_cnt--; handle = (handle_t *)ref_cnt; current->journal_info = handle; } /* * Wrappers for jbd2_journal_start/end. */ static int ext4_journal_check_start(struct super_block *sb) { journal_t *journal; might_sleep(); if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; } handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; if (inode) trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); else trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; int rc; if (!ext4_handle_valid(handle)) { ext4_put_nojournal(handle); return 0; } err = handle->h_err; if (!handle->h_transaction) { rc = jbd2_journal_stop(handle); return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; rc = jbd2_journal_stop(handle); if (!err) err = rc; if (err) __ext4_std_error(sb, where, line, err); return err; } handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type) { struct super_block *sb; int err; if (!ext4_handle_valid(handle)) return ext4_get_nojournal(); sb = handle->h_journal->j_private; trace_ext4_journal_start_reserved(sb, jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); return ERR_PTR(err); } err = jbd2_journal_start_reserved(handle, type, line); if (err < 0) return ERR_PTR(err); return handle; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred) { if (!ext4_handle_valid(handle)) return 0; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); return ext4_journal_extend(handle, extend_cred, revoke_cred); } static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); BUG_ON(!ext4_handle_valid(handle)); if (bh) BUFFER_TRACE(bh, "abort"); if (!handle->h_err) handle->h_err = err; if (is_handle_aborted(handle)) return; printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } static void ext4_check_bdev_write_error(struct super_block *sb) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; /* * If the block device has write error flag, it may have failed to * async write out metadata buffers in the background. In this case, * we could read old data from disk and write it out again, which * may lead to on-disk filesystem inconsistency. */ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { spin_lock(&sbi->s_bdev_wb_lock); err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); spin_unlock(&sbi->s_bdev_wb_lock); if (err) ext4_error_err(sb, -err, "Error while async write back metadata"); } } int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; might_sleep(); ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; might_sleep(); trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); /* In the no journal case, we can just do a bforget and return */ if (!ext4_handle_valid(handle)) { bforget(bh); return 0; } /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled * data blocks. */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } return 0; } /* * data!=journal && (is_metadata || should_journal_data(inode)) */ BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); __ext4_error(inode->i_sb, where, line, true, -err, 0, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; if (!ext4_handle_valid(handle)) return 0; err = jbd2_journal_get_create_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh) { int err = 0; might_sleep(); set_buffer_meta(bh); set_buffer_prio(bh); set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { pr_err("EXT4: jbd2_journal_dirty_metadata " "failed: handle type %u started at " "line %u, credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " "handle type %u started at line %u, " "credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); } } else { if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { ext4_error_inode_err(inode, where, line, bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } } } return err; }
6 6 214 6 4 1 3 1 2 1 2 6 6 6 4 3 1 1 4 2 3 3 3 3 5 5 5 5 3 4 4 1 4 1 4 3 7 3 7 7 1 1 1 7 4 2 4 4 4 213 245 1 6 6 1539 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; int i; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { p->id = nhg->nh_entries[i].nh->id; p->weight = nhg->nh_entries[i].weight - 1; p += 1; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nhg)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd1 || nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); return -EINVAL; } if (nhg[i].weight > 254) { NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!rc) rc = nhge->nh; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; return nhge->nh; } return rc ? : nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { int prev_upper_bound = 0; int total = 0; int w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = entry[i].weight + 1; list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } return 0; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; return __nh_valid_get_del_req(nlh, tb, id, extack); } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nh_valid_get_del_req(nlh, &id, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; int err; u32 id; err = nh_valid_get_del_req(nlh, &id, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); if (err < 0) { if (likely(skb->len)) err = skb->len; } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } if (err < 0) { if (likely(skb->len)) err = skb->len; } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = __nh_valid_get_del_req(nlh, tb, id, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (err) goto unlock; nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { flush_all_nexthops(net); kfree(net->nexthop.devhash); } rtnl_unlock(); } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit_batch = nexthop_net_exit_batch, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, rtm_dump_nexthop, 0); rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, rtm_dump_nexthop_bucket, 0); return 0; } subsys_initcall(nexthop_init);
6 26 6 6 26 26 2 2 1 1 7 6 6 6 6 6 6 6 7 4 4 2 4 2 2 2 4 4 3 2 2 4 4 3 2 2 2 4 1 1 1 26 26 26 26 26 32 32 29 29 27 27 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/caif/caif_socket.h> #include <linux/pkt_sched.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); /* * CAIF state is re-using the TCP socket states. * caif_states stored in sk_state reflect the state as reported by * the CAIF stack, while sk_socket->state is the state of the socket. */ enum caif_states { CAIF_CONNECTED = TCP_ESTABLISHED, CAIF_CONNECTING = TCP_SYN_SENT, CAIF_DISCONNECTED = TCP_CLOSE }; #define TX_FLOW_ON_BIT 1 #define RX_FLOW_ON_BIT 2 struct caifsock { struct sock sk; /* must be first member */ struct cflayer layer; unsigned long flow_state; struct caif_connect_request conn_req; struct mutex readlock; struct dentry *debugfs_socket_dir; int headroom, tailroom, maxframe; }; static int rx_flow_is_on(struct caifsock *cf_sk) { return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static int tx_flow_is_on(struct caifsock *cf_sk) { return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_off(struct caifsock *cf_sk) { clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_rx_flow_on(struct caifsock *cf_sk) { set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_off(struct caifsock *cf_sk) { clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void set_tx_flow_on(struct caifsock *cf_sk) { set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state); } static void caif_read_lock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_lock(&cf_sk->readlock); } static void caif_read_unlock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_unlock(&cf_sk->readlock); } static int sk_rcvbuf_lowwater(struct caifsock *cf_sk) { /* A quarter of full buffer is used a low water mark */ return cf_sk->sk.sk_rcvbuf / 4; } static void caif_flow_ctrl(struct sock *sk, int mode) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd) cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode); } /* * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n", atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); set_rx_flow_off(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } err = sk_filter(sk, skb); if (err) goto out; if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } skb->dev = NULL; skb_set_owner_r(skb, sk); spin_lock_irqsave(&list->lock, flags); queued = !sock_flag(sk, SOCK_DEAD); if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); out: if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); } /* Packet Receive Callback function called from CAIF Stack */ static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct caifsock *cf_sk; struct sk_buff *skb; cf_sk = container_of(layr, struct caifsock, layer); skb = cfpkt_tonative(pkt); if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) { kfree_skb(skb); return 0; } caif_queue_rcv_skb(&cf_sk->sk, skb); return 0; } static void cfsk_hold(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_hold(&cf_sk->sk); } static void cfsk_put(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_put(&cf_sk->sk); } /* Packet Control Callback function called from CAIF */ static void caif_ctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); switch (flow) { case CAIF_CTRLCMD_FLOW_ON_IND: /* OK from modem to start sending again */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_FLOW_OFF_IND: /* Modem asks us to shut up */ set_tx_flow_off(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_RSP: /* We're now connected */ caif_client_register_refcnt(&cf_sk->layer, cfsk_hold, cfsk_put); cf_sk->sk.sk_state = CAIF_CONNECTED; set_tx_flow_on(cf_sk); cf_sk->sk.sk_shutdown = 0; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_DEINIT_RSP: /* We're now disconnected */ cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_FAIL_RSP: /* Connect request failed */ cf_sk->sk.sk_err = ECONNREFUSED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; /* * Socket "standards" seems to require POLLOUT to * be set at connect failure. */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: /* Modem has closed this connection, or device is down. */ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); sk_error_report(&cf_sk->sk); break; default: pr_debug("Unexpected flow command %d\n", flow); } } static void caif_check_flow_release(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (rx_flow_is_on(cf_sk)) return; if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) { set_rx_flow_on(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ); } } /* * Copied from unix_dgram_recvmsg, but removed credit checks, * changed locking, address handling and added MSG_TRUNC. */ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int ret; int copylen; ret = -EOPNOTSUPP; if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) goto read_error; copylen = skb->len; if (len < copylen) { m->msg_flags |= MSG_TRUNC; copylen = len; } ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; ret = (flags & MSG_TRUNC) ? skb->len : copylen; out_free: skb_free_datagram(sk, skb); caif_check_flow_release(sk); return ret; read_error: return ret; } /* Copied from unix_stream_wait_data, identical except for lock call. */ static long caif_stream_data_wait(struct sock *sk, long timeo) { DEFINE_WAIT(wait); lock_sock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || sk->sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); release_sock(sk); return timeo; } /* * Copied from unix_stream_recvmsg, but removed credit checks, * changed locking calls, changed address handling. */ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int copied = 0; int target; int err = 0; long timeo; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; /* * Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ err = -EAGAIN; if (sk->sk_state == CAIF_CONNECTING) goto out; caif_read_lock(sk); target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); do { int chunk; struct sk_buff *skb; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; err = -ECONNRESET; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; err = -EPIPE; if (sk->sk_state != CAIF_CONNECTED) goto unlock; if (sock_flag(sk, SOCK_DEAD)) goto unlock; release_sock(sk); err = -EAGAIN; if (!timeo) break; caif_read_unlock(sk); timeo = caif_stream_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } caif_read_lock(sk); continue; unlock: release_sock(sk); break; } release_sock(sk); chunk = min_t(unsigned int, skb->len, size); if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { skb_pull(skb, chunk); /* put the skb back if we didn't use it up. */ if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* * It is questionable, see note in unix_dgram_recvmsg. */ /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); caif_read_unlock(sk); out: return copied ? : err; } /* * Copied from sock.c:sock_wait_for_wmem, but change to wait for * CAIF flow-on and sock_writable. */ static long caif_wait_for_flow_on(struct caifsock *cf_sk, int wait_writeable, long timeo, int *err) { struct sock *sk = &cf_sk->sk; DEFINE_WAIT(wait); for (;;) { *err = 0; if (tx_flow_is_on(cf_sk) && (!wait_writeable || sock_writeable(&cf_sk->sk))) break; *err = -ETIMEDOUT; if (!timeo) break; *err = -ERESTARTSYS; if (signal_pending(current)) break; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); *err = -ECONNRESET; if (sk->sk_shutdown & SHUTDOWN_MASK) break; *err = -sk->sk_err; if (sk->sk_err) break; *err = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Transmit a SKB. The device may temporarily request re-transmission * by returning EAGAIN. */ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, int noblock, long timeo) { struct cfpkt *pkt; pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); cfpkt_set_prio(pkt, cf_sk->sk.sk_priority); if (cf_sk->layer.dn == NULL) { kfree_skb(skb); return -EINVAL; } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int buffer_size; int ret = 0; struct sk_buff *skb = NULL; int noblock; long timeo; caif_assert(cf_sk); ret = sock_error(sk); if (ret) goto err; ret = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto err; ret = -EOPNOTSUPP; if (msg->msg_namelen) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk), 1, timeo, &ret); if (ret) goto err; ret = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto err; /* Error if trying to write more than maximum frame size. */ ret = -EMSGSIZE; if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM) goto err; buffer_size = len + cf_sk->headroom + cf_sk->tailroom; ret = -ENOMEM; skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret); if (!skb || skb_tailroom(skb) < buffer_size) goto err; skb_reserve(skb, cf_sk->headroom); ret = memcpy_from_msg(skb_put(skb, len), msg, len); if (ret) goto err; ret = transmit_skb(skb, cf_sk, noblock, timeo); if (ret < 0) /* skb is already freed */ return ret; return len; err: kfree_skb(skb); return ret; } /* * Copied from unix_stream_sendmsg and adapted to CAIF: * Changed removed permission handling and added waiting for flow on * and other minor adaptations. */ static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int err, size; struct sk_buff *skb; int sent = 0; long timeo; err = -EOPNOTSUPP; if (unlikely(msg->msg_flags&MSG_OOB)) goto out_err; if (unlikely(msg->msg_namelen)) goto out_err; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err); if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) goto pipe_err; while (sent < len) { size = len-sent; if (size > cf_sk->maxframe) size = cf_sk->maxframe; /* If size is more than half of sndbuf, chop up message */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; if (size > SKB_MAX_ALLOC) size = SKB_MAX_ALLOC; skb = sock_alloc_send_skb(sk, size + cf_sk->headroom + cf_sk->tailroom, msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out_err; skb_reserve(skb, cf_sk->headroom); /* * If you pass two values to the sock_alloc_send_skb * it tries to grab the large buffer with GFP_NOFS * (which can fail easily), and if it fails grab the * fallback size buffer which is under a page and will * succeed. [Alan] */ size = min_t(int, size, skb_tailroom(skb)); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); goto out_err; } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); if (err < 0) /* skb is already freed */ goto pipe_err; sent += size; } return sent; pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: return sent ? : err; } static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; switch (opt) { case CAIFSO_LINK_SELECT: if (ol < sizeof(int)) return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; release_sock(&cf_sk->sk); return 0; case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL) return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } cf_sk->conn_req.param.size = ol; release_sock(&cf_sk->sk); return 0; default: return -ENOPROTOOPT; } return 0; bad_sol: return -ENOPROTOOPT; } /* * caif_connect() - Connect a CAIF Socket * Copied and modified af_irda.c:irda_connect(). * * Note : by consulting "errno", the user space caller may learn the cause * of the failure. Most of them are visible in the function, others may come * from subroutines called and are listed here : * o -EAFNOSUPPORT: bad socket family or type. * o -ESOCKTNOSUPPORT: bad socket type or protocol * o -EINVAL: bad socket address, or CAIF link type * o -ECONNREFUSED: remote end refused the connection. * o -EINPROGRESS: connect request sent but timed out (or non-blocking) * o -EISCONN: already connected. * o -ETIMEDOUT: Connection timed out (send timeout) * o -ENODEV: No link layer to send request * o -ECONNRESET: Received Shutdown indication or lost link layer * o -ENOMEM: Out of memory * * State Strategy: * o sk_state: holds the CAIF_* protocol state, it's updated by * caif_ctrl_cb. * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); long timeo; int err; int ifindex, headroom, tailroom; unsigned int mtu; struct net_device *dev; lock_sock(sk); err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) goto out; err = -EAFNOSUPPORT; if (uaddr->sa_family != AF_CAIF) goto out; switch (sock->state) { case SS_UNCONNECTED: /* Normal case, a fresh connect */ caif_assert(sk->sk_state == CAIF_DISCONNECTED); break; case SS_CONNECTING: switch (sk->sk_state) { case CAIF_CONNECTED: sock->state = SS_CONNECTED; err = -EISCONN; goto out; case CAIF_DISCONNECTED: /* Reconnect allowed */ break; case CAIF_CONNECTING: err = -EALREADY; if (flags & O_NONBLOCK) goto out; goto wait_connect; } break; case SS_CONNECTED: caif_assert(sk->sk_state == CAIF_CONNECTED || sk->sk_state == CAIF_DISCONNECTED); if (sk->sk_shutdown & SHUTDOWN_MASK) { /* Allow re-connect after SHUTDOWN_IND */ caif_disconnect_client(sock_net(sk), &cf_sk->layer); caif_free_client(&cf_sk->layer); break; } /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out; case SS_DISCONNECTING: case SS_FREE: caif_assert(1); /*Should never happen */ break; } sk->sk_state = CAIF_DISCONNECTED; sock->state = SS_UNCONNECTED; sk_stream_kill_queues(&cf_sk->sk); err = -EINVAL; if (addr_len != sizeof(struct sockaddr_caif)) goto out; memcpy(&cf_sk->conn_req.sockaddr, uaddr, sizeof(struct sockaddr_caif)); /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; /* Check priority value comming from socket */ /* if priority value is out of range it will be ajusted */ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) cf_sk->conn_req.priority = CAIF_PRIO_MAX; else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) cf_sk->conn_req.priority = CAIF_PRIO_MIN; else cf_sk->conn_req.priority = cf_sk->sk.sk_priority; /*ifindex = id of the interface.*/ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(sock_net(sk), &cf_sk->conn_req, &cf_sk->layer, &ifindex, &headroom, &tailroom); if (err < 0) { cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; goto out; } err = -ENODEV; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { rcu_read_unlock(); goto out; } cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); mtu = dev->mtu; rcu_read_unlock(); cf_sk->tailroom = tailroom; cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); err = -ENODEV; goto out; } err = -EINPROGRESS; wait_connect: if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK)) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); release_sock(sk); err = -ERESTARTSYS; timeo = wait_event_interruptible_timeout(*sk_sleep(sk), sk->sk_state != CAIF_CONNECTING, timeo); lock_sock(sk); if (timeo < 0) goto out; /* -ERESTARTSYS */ err = -ETIMEDOUT; if (timeo == 0 && sk->sk_state != CAIF_CONNECTED) goto out; if (sk->sk_state != CAIF_CONNECTED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); if (!err) err = -ECONNREFUSED; goto out; } sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; } /* * caif_release() - Disconnect a CAIF Socket * Copied and modified af_irda.c:irda_release(). */ static int caif_release(struct socket *sock) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (!sk) return 0; set_tx_flow_off(cf_sk); /* * Ensure that packets are not queued after this point in time. * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock, * this ensures no packets when sock is dead. */ spin_lock_bh(&sk->sk_receive_queue.lock); sock_set_flag(sk, SOCK_DEAD); spin_unlock_bh(&sk->sk_receive_queue.lock); sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; sk->sk_shutdown = SHUTDOWN_MASK; caif_disconnect_client(sock_net(sk), &cf_sk->layer); cf_sk->sk.sk_socket->state = SS_DISCONNECTING; wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP); sock_orphan(sk); sk_stream_kill_queues(&cf_sk->sk); release_sock(sk); sock_put(sk); return 0; } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ static __poll_t caif_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (sk->sk_err) mask |= EPOLLERR; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (sock_writeable(sk) && tx_flow_is_on(cf_sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static const struct proto_ops caif_seqpacket_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops caif_stream_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, }; /* This function is called when a socket is finally destroyed. */ static void caif_sock_destructor(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); caif_assert(!refcount_read(&sk->sk_wmem_alloc)); caif_assert(sk_unhashed(sk)); caif_assert(!sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_debug("Attempt to release alive CAIF socket: %p\n", sk); return; } sk_stream_kill_queues(&cf_sk->sk); WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } static int caif_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk = NULL; struct caifsock *cf_sk = NULL; static struct proto prot = {.name = "PF_CAIF", .owner = THIS_MODULE, .obj_size = sizeof(struct caifsock), .useroffset = offsetof(struct caifsock, conn_req.param), .usersize = sizeof_field(struct caifsock, conn_req.param) }; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN)) return -EPERM; /* * The sock->type specifies the socket type to use. * The CAIF socket is a packet stream in the sense * that it is packet based. CAIF trusts the reliability * of the link, no resending is implemented. */ if (sock->type == SOCK_SEQPACKET) sock->ops = &caif_seqpacket_ops; else if (sock->type == SOCK_STREAM) sock->ops = &caif_stream_ops; else return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= CAIFPROTO_MAX) return -EPROTONOSUPPORT; /* * Set the socket state to unconnected. The socket state * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; cf_sk = container_of(sk, struct caifsock, sk); /* Store the protocol */ sk->sk_protocol = (unsigned char) protocol; /* Initialize default priority for well-known cases */ switch (protocol) { case CAIFPROTO_AT: sk->sk_priority = TC_PRIO_CONTROL; break; case CAIFPROTO_RFM: sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; break; default: sk->sk_priority = TC_PRIO_BESTEFFORT; } /* * Lock in order to try to stop someone from opening the socket * too early. */ lock_sock(&(cf_sk->sk)); /* Initialize the nozero default sock structure data. */ sock_init_data(sock, sk); sk->sk_destruct = caif_sock_destructor; mutex_init(&cf_sk->readlock); /* single task reading lock */ cf_sk->layer.ctrlcmd = caif_ctrl_cb; cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; set_tx_flow_off(cf_sk); set_rx_flow_on(cf_sk); /* Set default options on configuration */ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; release_sock(&cf_sk->sk); return 0; } static const struct net_proto_family caif_family_ops = { .family = PF_CAIF, .create = caif_create, .owner = THIS_MODULE, }; static int __init caif_sktinit_module(void) { return sock_register(&caif_family_ops); } static void __exit caif_sktexit_module(void) { sock_unregister(PF_CAIF); } module_init(caif_sktinit_module); module_exit(caif_sktexit_module);
1537 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include "ipvlan.h" #include <linux/if_vlan.h> #include <linux/if_tap.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static dev_t ipvtap_major; static struct cdev ipvtap_cdev; static const void *ipvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class ipvtap_class = { .name = "ipvtap", .ns_type = &net_ns_type_operations, .namespace = ipvtap_net_namespace, }; struct ipvtap_dev { struct ipvl_dev vlan; struct tap_dev tap; }; static void ipvtap_count_tx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_drps); } static void ipvtap_count_rx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; ipvlan_count_rx(vlan, 0, 0, 0); } static void ipvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; vlan->sfeatures = features; netdev_update_features(vlan->dev); } static int ipvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ipvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped; vlantap->tap.update_features = ipvtap_update_features; vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = ipvlan_link_new(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return err; } static void ipvtap_dellink(struct net_device *dev, struct list_head *head) { struct ipvtap_dev *vlan = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlan->tap); ipvlan_link_delete(dev, head); } static void ipvtap_setup(struct net_device *dev) { ipvlan_link_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; dev->priv_flags &= ~IFF_NO_QUEUE; } static struct rtnl_link_ops ipvtap_link_ops __read_mostly = { .kind = "ipvtap", .setup = ipvtap_setup, .newlink = ipvtap_newlink, .dellink = ipvtap_dellink, .priv_size = sizeof(struct ipvtap_dev), }; static int ipvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &ipvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(ipvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); classdev = device_create(&ipvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(ipvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); device_destroy(&ipvtap_class, devt); tap_free_minor(ipvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block ipvtap_notifier_block __read_mostly = { .notifier_call = ipvtap_device_event, }; static int __init ipvtap_init(void) { int err; err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap", THIS_MODULE); if (err) goto out1; err = class_register(&ipvtap_class); if (err) goto out2; err = register_netdevice_notifier(&ipvtap_notifier_block); if (err) goto out3; err = ipvlan_link_register(&ipvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&ipvtap_notifier_block); out3: class_unregister(&ipvtap_class); out2: tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); out1: return err; } module_init(ipvtap_init); static void __exit ipvtap_exit(void) { rtnl_link_unregister(&ipvtap_link_ops); unregister_netdevice_notifier(&ipvtap_notifier_block); class_unregister(&ipvtap_class); tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); } module_exit(ipvtap_exit); MODULE_ALIAS_RTNL_LINK("ipvtap"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL");
23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include <linux/skb_array.h> struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr) { int cpu; struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker); if (!worker) return NULL; for_each_possible_cpu(cpu) { per_cpu_ptr(worker, cpu)->ptr = ptr; INIT_WORK(&per_cpu_ptr(worker, cpu)->work, function); } return worker; } int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len) { int ret; memset(queue, 0, sizeof(*queue)); queue->last_cpu = -1; ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL); if (ret) return ret; queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue); if (!queue->worker) { ptr_ring_cleanup(&queue->ring, NULL); return -ENOMEM; } return 0; } void wg_packet_queue_free(struct crypt_queue *queue, bool purge) { free_percpu(queue->worker); WARN_ON(!purge && !__ptr_ring_empty(&queue->ring)); ptr_ring_cleanup(&queue->ring, purge ? __skb_array_destroy_skb : NULL); } #define NEXT(skb) ((skb)->prev) #define STUB(queue) ((struct sk_buff *)&queue->empty) void wg_prev_queue_init(struct prev_queue *queue) { NEXT(STUB(queue)) = NULL; queue->head = queue->tail = STUB(queue); queue->peeked = NULL; atomic_set(&queue->count, 0); BUILD_BUG_ON( offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) - offsetof(struct prev_queue, empty) || offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) - offsetof(struct prev_queue, empty)); } static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) { WRITE_ONCE(NEXT(skb), NULL); WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb); } bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb) { if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS)) return false; __wg_prev_queue_enqueue(queue, skb); return true; } struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue) { struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail)); if (tail == STUB(queue)) { if (!next) return NULL; queue->tail = next; tail = next; next = smp_load_acquire(&NEXT(next)); } if (next) { queue->tail = next; atomic_dec(&queue->count); return tail; } if (tail != READ_ONCE(queue->head)) return NULL; __wg_prev_queue_enqueue(queue, STUB(queue)); next = smp_load_acquire(&NEXT(tail)); if (next) { queue->tail = next; atomic_dec(&queue->count); return tail; } return NULL; } #undef NEXT #undef STUB
3764 3764 3764 225 225 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 // SPDX-License-Identifier: GPL-2.0-only /* * fs/crypto/hooks.c * * Encryption hooks for higher-level filesystem operations. */ #include "fscrypt_private.h" /** * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * * Currently, an encrypted regular file can only be opened if its encryption key * is available; access to the raw encrypted contents is not supported. * Therefore, we first set up the inode's encryption key (if not already done) * and return an error if it's unavailable. * * We also verify that if the parent directory (from the path via which the file * is being opened) is encrypted, then the inode being opened uses the same * encryption policy. This is needed as part of the enforcement that all files * in an encrypted directory tree use the same encryption policy, as a * protection against certain types of offline attacks. Note that this check is * needed even when opening an *unencrypted* file, since it's forbidden to have * an unencrypted file in an encrypted directory. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code */ int fscrypt_file_open(struct inode *inode, struct file *filp) { int err; struct dentry *dir; err = fscrypt_require_key(inode); if (err) return err; dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { fscrypt_warn(inode, "Inconsistent encryption context (parent directory: %lu)", d_inode(dir)->i_ino); err = -EPERM; } dput(dir); return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inode's key is * available, as it's implied by the dentry not being a no-key name. */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inodes' keys are * available, as it's implied by the dentries not being no-key names. */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) return -EXDEV; } return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname); if (err && err != -ENOENT) return err; if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); /** * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup * @dir: the encrypted directory being searched * @dentry: the dentry being looked up in @dir * * This function should be used by the ->lookup and ->atomic_open methods of * filesystems that handle filename encryption and no-key name encoding * themselves and thus can't use fscrypt_prepare_lookup(). Like * fscrypt_prepare_lookup(), this will try to set up the directory's encryption * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. * However, this function doesn't set up a struct fscrypt_name for the filename. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); if (!err && !fscrypt_has_encryption_key(dir)) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return fscrypt_require_key(d_inode(dentry)); return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed * @oldflags: the old flags * @flags: the new flags * * The caller should be holding i_rwsem for write. * * Return: 0 on success; -errno if the flags change isn't allowed or if * another error occurs. */ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_info *ci; struct fscrypt_master_key *mk; int err; /* * When the CASEFOLD flag is set on an encrypted directory, we must * derive the secret key needed for the dirhash. This is only possible * if the directory uses a v2 encryption policy. */ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { err = fscrypt_require_key(inode); if (err) return err; ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (is_master_key_secret_present(&mk->mk_secret)) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); return err; } return 0; } /** * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator * @max_len: space the filesystem has available to store the symlink target * @disk_link: (out) the on-disk symlink target being prepared * * This function computes the size the symlink target will require on-disk, * stores it in @disk_link->len, and validates it against @max_len. An * encrypted symlink may be longer than the original. * * Additionally, @disk_link->name is set to @target if the symlink will be * unencrypted, but left NULL if the symlink will be encrypted. For encrypted * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the * on-disk target later. (The reason for the two-step process is that some * filesystems need to know the size of the symlink target before creating the * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) * * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, * -ENOKEY if the encryption key is missing, or another -errno code if a problem * occurred while setting up the encryption key. */ int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. */ policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) { /* Not encrypted */ disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } if (IS_ERR(policy)) return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't * exceed max_len. Note that for historical reasons, encrypted symlink * targets are prefixed with the ciphertext length, despite this * actually being redundant with i_size. This decreases by 2 bytes the * longest symlink target we can accept. * * We could recover 1 byte by not counting a null terminator, but * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; } EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; /* * fscrypt_prepare_new_inode() should have already set up the new * symlink inode's encryption key. We don't wait until now to do it, * since we may be in a filesystem transaction now. */ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ sd = (struct fscrypt_symlink_data *)disk_link->name; } else { sd = kmalloc(disk_link->len, GFP_NOFS); if (!sd) return -ENOMEM; } ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) goto err_free_sd; /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well * initialize it just in case the filesystem writes it out. */ sd->encrypted_path[ciphertext_len] = '\0'; /* Cache the plaintext symlink target for later use by get_link() */ err = -ENOMEM; inode->i_link = kmemdup(target, len + 1, GFP_NOFS); if (!inode->i_link) goto err_free_sd; if (!disk_link->name) disk_link->name = (unsigned char *)sd; return 0; err_free_sd: if (!disk_link->name) kfree(sd); return err; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer * @done: if successful, will be set up to free the returned target if needed * * If the symlink's encryption key is available, we decrypt its target. * Otherwise, we encode its target for presentation. * * This may sleep, so the filesystem must have dropped out of RCU mode already. * * Return: the presentable symlink target or an ERR_PTR() */ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { const struct fscrypt_symlink_data *sd; struct fscrypt_str cstr, pstr; bool has_key; int err; /* This is for encrypted symlinks only */ if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. */ pstr.name = READ_ONCE(inode->i_link); if (pstr.name) return pstr.name; /* * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); /* * For historical reasons, encrypted symlink targets are prefixed with * the ciphertext length, even though this is redundant with i_size. */ if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); if (cstr.len == 0) return ERR_PTR(-EUCLEAN); if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (err) goto err_kfree; err = -EUCLEAN; if (pstr.name[0] == '\0') goto err_kfree; pstr.name[pstr.len] = '\0'; /* * Cache decrypted symlink targets in i_link for later use. Don't cache * symlink targets encoded without the key, since those become outdated * once the key is added. This pairs with the READ_ONCE() above and in * the VFS path lookup code. */ if (!has_key || cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) set_delayed_call(done, kfree_link, pstr.name); return pstr.name; err_kfree: kfree(pstr.name); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); /** * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks * @path: the path for the encrypted symlink being queried * @stat: the struct being filled with the symlink's attributes * * Override st_size of encrypted symlinks to be the length of the decrypted * symlink target (or the no-key encoded symlink target, if the key is * unavailable) rather than the length of the encrypted symlink target. This is * necessary for st_size to match the symlink target that userspace actually * sees. POSIX requires this, and some userspace programs depend on it. * * This requires reading the symlink target from disk if needed, setting up the * inode's encryption key if possible, and then decrypting or encoding the * symlink target. This makes lstat() more heavyweight than is normally the * case. However, decrypted symlink targets will be cached in ->i_link, so * usually the symlink won't have to be read and decrypted again later if/when * it is actually followed, readlink() is called, or lstat() is called again. * * Return: 0 on success, -errno on failure */ int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); const char *link; DEFINE_DELAYED_CALL(done); /* * To get the symlink target that userspace will see (whether it's the * decrypted target or the no-key encoded target), we can just get it in * the same way the VFS does during path resolution and readlink(). */ link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } stat->size = strlen(link); do_delayed_call(&done); return 0; } EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
304 302 603 303 602 603 602 603 622 621 621 306 303 603 309 305 225 14 14 13 7 14 14 14 14 18 17 14 4190 4192 1933 1494 4195 641 622 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/network.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* Structure for holding inet domain socket's address. */ struct tomoyo_inet_addr_info { __be16 port; /* In network byte order. */ const __be32 *address; /* In network byte order. */ bool is_ipv6; }; /* Structure for holding unix domain socket's address. */ struct tomoyo_unix_addr_info { u8 *addr; /* This may not be '\0' terminated string. */ unsigned int addr_len; }; /* Structure for holding socket address. */ struct tomoyo_addr_info { u8 protocol; u8 operation; struct tomoyo_inet_addr_info inet; struct tomoyo_unix_addr_info unix0; }; /* String table for socket's protocols. */ const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX] = { [SOCK_STREAM] = "stream", [SOCK_DGRAM] = "dgram", [SOCK_RAW] = "raw", [SOCK_SEQPACKET] = "seqpacket", [0] = " ", /* Dummy for avoiding NULL pointer dereference. */ [4] = " ", /* Dummy for avoiding NULL pointer dereference. */ }; /** * tomoyo_parse_ipaddr_union - Parse an IP address. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_ipaddr_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr) { u8 * const min = ptr->ip[0].in6_u.u6_addr8; u8 * const max = ptr->ip[1].in6_u.u6_addr8; char *address = tomoyo_read_token(param); const char *end; if (!strchr(address, ':') && in4_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = false; if (!*end) ptr->ip[1].s6_addr32[0] = ptr->ip[0].s6_addr32[0]; else if (*end++ != '-' || in4_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } if (in6_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = true; if (!*end) memmove(max, min, sizeof(u16) * 8); else if (*end++ != '-' || in6_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } return false; } /** * tomoyo_print_ipv4 - Print an IPv4 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to __be32. * @max_ip: Pointer to __be32. * * Returns nothing. */ static void tomoyo_print_ipv4(char *buffer, const unsigned int buffer_len, const __be32 *min_ip, const __be32 *max_ip) { snprintf(buffer, buffer_len, "%pI4%c%pI4", min_ip, *min_ip == *max_ip ? '\0' : '-', max_ip); } /** * tomoyo_print_ipv6 - Print an IPv6 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to "struct in6_addr". * @max_ip: Pointer to "struct in6_addr". * * Returns nothing. */ static void tomoyo_print_ipv6(char *buffer, const unsigned int buffer_len, const struct in6_addr *min_ip, const struct in6_addr *max_ip) { snprintf(buffer, buffer_len, "%pI6c%c%pI6c", min_ip, !memcmp(min_ip, max_ip, 16) ? '\0' : '-', max_ip); } /** * tomoyo_print_ip - Print an IP address. * * @buf: Buffer to write to. * @size: Size of @buf. * @ptr: Pointer to "struct ipaddr_union". * * Returns nothing. */ void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr) { if (ptr->is_ipv6) tomoyo_print_ipv6(buf, size, &ptr->ip[0], &ptr->ip[1]); else tomoyo_print_ipv4(buf, size, &ptr->ip[0].s6_addr32[0], &ptr->ip[1].s6_addr32[0]); } /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for inet domain socket. */ static const u8 tomoyo_inet2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, }, [SOCK_RAW] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_RAW_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_RAW_SEND, }, }; /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for unix domain socket. */ static const u8 tomoyo_unix2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, }, [SOCK_SEQPACKET] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, }, }; /** * tomoyo_same_inet_acl - Check for duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_inet_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_inet_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_inet_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_ipaddr_union(&p1->address, &p2->address) && tomoyo_same_number_union(&p1->port, &p2->port); } /** * tomoyo_same_unix_acl - Check for duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_unix_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_unix_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_unix_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_inet_acl - Merge duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_inet_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_inet_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_inet_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_merge_unix_acl - Merge duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_unix_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_unix_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_unix_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_write_inet_network - Write "struct tomoyo_inet_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_inet_network(struct tomoyo_acl_param *param) { struct tomoyo_inet_acl e = { .head.type = TOMOYO_TYPE_INET_ACL }; int error = -EINVAL; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (param->data[0] == '@') { param->data++; e.address.group = tomoyo_get_group(param, TOMOYO_ADDRESS_GROUP); if (!e.address.group) return -ENOMEM; } else { if (!tomoyo_parse_ipaddr_union(param, &e.address)) goto out; } if (!tomoyo_parse_number_union(param, &e.port) || e.port.values[1] > 65535) goto out; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_inet_acl, tomoyo_merge_inet_acl); out: tomoyo_put_group(e.address.group); tomoyo_put_number_union(&e.port); return error; } /** * tomoyo_write_unix_network - Write "struct tomoyo_unix_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_unix_network(struct tomoyo_acl_param *param) { struct tomoyo_unix_acl e = { .head.type = TOMOYO_TYPE_UNIX_ACL }; int error; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (!tomoyo_parse_name_union(param, &e.name)) return -EINVAL; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_unix_acl, tomoyo_merge_unix_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_audit_net_log - Audit network log. * * @r: Pointer to "struct tomoyo_request_info". * @family: Name of socket family ("inet" or "unix"). * @protocol: Name of protocol in @family. * @operation: Name of socket operation. * @address: Name of address. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_net_log(struct tomoyo_request_info *r, const char *family, const u8 protocol, const u8 operation, const char *address) { return tomoyo_supervisor(r, "network %s %s %s %s\n", family, tomoyo_proto_keyword[protocol], tomoyo_socket_keyword[operation], address); } /** * tomoyo_audit_inet_log - Audit INET network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_inet_log(struct tomoyo_request_info *r) { char buf[128]; int len; const __be32 *address = r->param.inet_network.address; if (r->param.inet_network.is_ipv6) tomoyo_print_ipv6(buf, sizeof(buf), (const struct in6_addr *) address, (const struct in6_addr *) address); else tomoyo_print_ipv4(buf, sizeof(buf), address, address); len = strlen(buf); snprintf(buf + len, sizeof(buf) - len, " %u", r->param.inet_network.port); return tomoyo_audit_net_log(r, "inet", r->param.inet_network.protocol, r->param.inet_network.operation, buf); } /** * tomoyo_audit_unix_log - Audit UNIX network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_unix_log(struct tomoyo_request_info *r) { return tomoyo_audit_net_log(r, "unix", r->param.unix_network.protocol, r->param.unix_network.operation, r->param.unix_network.address->name); } /** * tomoyo_check_inet_acl - Check permission for inet domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_inet_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_inet_acl *acl = container_of(ptr, typeof(*acl), head); const u8 size = r->param.inet_network.is_ipv6 ? 16 : 4; if (!(acl->perm & (1 << r->param.inet_network.operation)) || !tomoyo_compare_number_union(r->param.inet_network.port, &acl->port)) return false; if (acl->address.group) return tomoyo_address_matches_group (r->param.inet_network.is_ipv6, r->param.inet_network.address, acl->address.group); return acl->address.is_ipv6 == r->param.inet_network.is_ipv6 && memcmp(&acl->address.ip[0], r->param.inet_network.address, size) <= 0 && memcmp(r->param.inet_network.address, &acl->address.ip[1], size) <= 0; } /** * tomoyo_check_unix_acl - Check permission for unix domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_unix_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_unix_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.unix_network.operation)) && tomoyo_compare_name_union(r->param.unix_network.address, &acl->name); } /** * tomoyo_inet_entry - Check permission for INET network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inet_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_inet2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { r.param_type = TOMOYO_TYPE_INET_ACL; r.param.inet_network.protocol = address->protocol; r.param.inet_network.operation = address->operation; r.param.inet_network.is_ipv6 = address->inet.is_ipv6; r.param.inet_network.address = address->inet.address; r.param.inet_network.port = ntohs(address->inet.port); do { tomoyo_check_acl(&r, tomoyo_check_inet_acl); error = tomoyo_audit_inet_log(&r); } while (error == TOMOYO_RETRY_REQUEST); } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_inet_address - Check permission for inet domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @port: Port number. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_inet_address(const struct sockaddr *addr, const unsigned int addr_len, const u16 port, struct tomoyo_addr_info *address) { struct tomoyo_inet_addr_info *i = &address->inet; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; switch (addr->sa_family) { case AF_INET6: if (addr_len < SIN6_LEN_RFC2133) goto skip; i->is_ipv6 = true; i->address = (__be32 *) ((struct sockaddr_in6 *) addr)->sin6_addr.s6_addr; i->port = ((struct sockaddr_in6 *) addr)->sin6_port; break; case AF_INET: if (addr_len < sizeof(struct sockaddr_in)) goto skip; i->is_ipv6 = false; i->address = (__be32 *) &((struct sockaddr_in *) addr)->sin_addr; i->port = ((struct sockaddr_in *) addr)->sin_port; break; default: goto skip; } if (address->protocol == SOCK_RAW) i->port = htons(port); return tomoyo_inet_entry(address); skip: return 0; } /** * tomoyo_unix_entry - Check permission for UNIX network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_unix_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_unix2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { char *buf = address->unix0.addr; int len = address->unix0.addr_len - sizeof(sa_family_t); if (len <= 0) { buf = "anonymous"; len = 9; } else if (buf[0]) { len = strnlen(buf, len); } buf = tomoyo_encode2(buf, len); if (buf) { struct tomoyo_path_info addr; addr.name = buf; tomoyo_fill_path_info(&addr); r.param_type = TOMOYO_TYPE_UNIX_ACL; r.param.unix_network.protocol = address->protocol; r.param.unix_network.operation = address->operation; r.param.unix_network.address = &addr; do { tomoyo_check_acl(&r, tomoyo_check_unix_acl); error = tomoyo_audit_unix_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf); } else error = -ENOMEM; } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_unix_address - Check permission for unix domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_unix_address(struct sockaddr *addr, const unsigned int addr_len, struct tomoyo_addr_info *address) { struct tomoyo_unix_addr_info *u = &address->unix0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; if (addr->sa_family != AF_UNIX) return 0; u->addr = ((struct sockaddr_un *) addr)->sun_path; u->addr_len = addr_len; return tomoyo_unix_entry(address); } /** * tomoyo_kernel_service - Check whether I'm kernel service or not. * * Returns true if I'm kernel service, false otherwise. */ static bool tomoyo_kernel_service(void) { /* Nothing to do if I am a kernel service. */ return current->flags & PF_KTHREAD; } /** * tomoyo_sock_family - Get socket's family. * * @sk: Pointer to "struct sock". * * Returns one of PF_INET, PF_INET6, PF_UNIX or 0. */ static u8 tomoyo_sock_family(struct sock *sk) { u8 family; if (tomoyo_kernel_service()) return 0; family = sk->sk_family; switch (family) { case PF_INET: case PF_INET6: case PF_UNIX: return family; default: return 0; } } /** * tomoyo_socket_listen_permission - Check permission for listening a socket. * * @sock: Pointer to "struct socket". * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_listen_permission(struct socket *sock) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; struct sockaddr_storage addr; int addr_len; if (!family || (type != SOCK_STREAM && type != SOCK_SEQPACKET)) return 0; { const int error = sock->ops->getname(sock, (struct sockaddr *) &addr, 0); if (error < 0) return error; addr_len = error; } address.protocol = type; address.operation = TOMOYO_NETWORK_LISTEN; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) &addr, addr_len, &address); return tomoyo_check_inet_address((struct sockaddr *) &addr, addr_len, 0, &address); } /** * tomoyo_socket_connect_permission - Check permission for setting the remote address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; address.protocol = type; switch (type) { case SOCK_DGRAM: case SOCK_RAW: address.operation = TOMOYO_NETWORK_SEND; break; case SOCK_STREAM: case SOCK_SEQPACKET: address.operation = TOMOYO_NETWORK_CONNECT; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_bind_permission - Check permission for setting the local address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; switch (type) { case SOCK_STREAM: case SOCK_DGRAM: case SOCK_RAW: case SOCK_SEQPACKET: address.protocol = type; address.operation = TOMOYO_NETWORK_BIND; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_sendmsg_permission - Check permission for sending a datagram. * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Unused. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!msg->msg_name || !family || (type != SOCK_DGRAM && type != SOCK_RAW)) return 0; address.protocol = type; address.operation = TOMOYO_NETWORK_SEND; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, &address); return tomoyo_check_inet_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, sock->sk->sk_protocol, &address); }
333 2 331 13 13 106 7 339 345 13 13 105 13 113 113 106 112 1 1 1 1 1 1 1 8 1 1 1 331 333 333 333 333 332 102 102 346 346 333 331 2 360 360 47 360 333 47 46 333 333 333 332 333 333 333 333 333 333 333 333 331 331 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp */ #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> #include <net/sock_reuseport.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { static u32 inet_ehash_secret __read_mostly; net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. */ static u32 sk_ehashfn(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return inet6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); #endif return inet_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } return tb; } /* * Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev) { return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev; } static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, struct net *net, struct inet_bind_hashbucket *head, unsigned short port, int l3mdev, const struct sock *sk) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = port; #if IS_ENABLED(CONFIG_IPV6) tb->family = sk->sk_family; if (sk->sk_family == AF_INET6) tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; else #endif tb->rcv_saddr = sk->sk_rcv_saddr; INIT_HLIST_HEAD(&tb->owners); INIT_HLIST_HEAD(&tb->deathrow); hlist_add_head(&tb->node, &head->chain); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, unsigned short port, int l3mdev, const struct sock *sk) { struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); return tb; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb2->family) return false; if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; } /* * Get rid of any references to a local port held by the given sock. */ static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; int bhash; bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); head = &hashinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; __sk_del_bind2_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); spin_unlock(&head->lock); } void inet_put_port(struct sock *sk) { local_bh_disable(); __inet_put_port(sk); local_bh_enable(); } EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; struct net *net = sock_net(sk); bool update_fastreuse = false; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int bhash, l3mdev; bhash = inet_bhashfn(net, port, table->bhash_size); head = &table->bhash[bhash]; head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); spin_lock(&head2->lock); tb = inet_csk(sk)->icsk_bind_hash; tb2 = inet_csk(sk)->icsk_bind2_hash; if (unlikely(!tb || !tb2)) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOENT; } if (tb->port != port) { l3mdev = inet_sk_bound_l3mdev(sk); /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } created_inet_bind_bucket = true; } update_fastreuse = true; goto bhash2_find; } else if (!inet_bind2_bucket_addr_match(tb2, child)) { l3mdev = inet_sk_bound_l3mdev(sk); bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, net, head2, port, l3mdev, child); if (!tb2) goto error; } } if (update_fastreuse) inet_csk_update_fastreuse(tb, child); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); return 0; error: if (created_inet_bind_bucket) inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); static struct inet_listen_hashbucket * inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); else #endif hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); return inet_lhash2_bucket(h, hash); } static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (sk->sk_family == PF_INET) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ /* called with rcu_read_lock() : No refcount taken on the socket */ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && hashinfo == net->ipv4.tcp_death_row.hashinfo) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); if (result) goto done; } hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with INADDR_ANY */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); /* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk) { if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); else if (sk->sk_state == TCP_NEW_SYN_RECV) reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); void sock_edemux(struct sk_buff *skb) { sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet_match(net, sk, acookie, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } /* Searches for an exsiting socket in the ehash bucket list. * Returns true if found, false otherwise. */ static bool inet_ehash_lookup_by_sk(struct sock *sk, struct hlist_nulls_head *list) { const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); const int sdif = sk->sk_bound_dev_if; const int dif = sk->sk_bound_dev_if; const struct hlist_nulls_node *node; struct net *net = sock_net(sk); struct sock *esk; INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); sk_nulls_for_each_rcu(esk, node, list) { if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { if (unlikely(inet_match(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { return true; } } #endif } return false; } /* Insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT) * If an existing socket already exists, socket sk is not inserted, * and sets found_dup_sk parameter to true. */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); } else if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; } if (ret) __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); return ret; } bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } return ok; } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); return err; } EXPORT_SYMBOL(__inet_hash); int inet_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); if (sk_unhashed(sk)) { spin_unlock_bh(lock); return; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } EXPORT_SYMBOL_GPL(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) { if (sk->sk_family == AF_INET) return ipv6_addr_v4mapped(&tb->v6_rcv_saddr) && tb->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; return false; } if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); #endif return tb->rcv_saddr == sk->sk_rcv_saddr; } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) { if (sk->sk_family == AF_INET) return ipv6_addr_any(&tb->v6_rcv_saddr) || ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); return false; } if (sk->sk_family == AF_INET6) return ipv6_addr_any(&tb->v6_rcv_saddr); #endif return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { struct inet_bind2_bucket *bhash2 = NULL; inet_bind_bucket_for_each(bhash2, &head->chain) if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) break; return bhash2; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &in6addr_any, port); else #endif hash = ipv4_portaddr_hash(net, 0, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } static void inet_update_saddr(struct sock *sk, void *saddr, int family) { if (family == AF_INET) { inet_sk(sk)->inet_saddr = *(__be32 *)saddr; sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); } #if IS_ENABLED(CONFIG_IPV6) else { sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; } #endif } static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); int bhash; if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); return 0; } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); if (!new_tb2) { if (reset) { /* The (INADDR_ANY, port) bucket might have already * been freed, then we cannot fixup icsk_bind2_hash, * so we give up and unlink sk from bhash/bhash2 not * to leave inconsistency in bhash2. */ inet_put_port(sk); inet_reset_saddr(sk); } return -ENOMEM; } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* If we change saddr locklessly, another thread * iterating over bhash might see corrupted address. */ spin_lock_bh(&head->lock); spin_lock(&head2->lock); __sk_del_bind2_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); return 0; } int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory. */ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; int l3mdev; u32 index; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; get_random_sleepable_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ offset &= ~1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, port, &tw)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; return -EADDRNOTAVAIL; ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well */ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, port, l3mdev, sk); if (!tb2) goto error; } /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. */ i = max_t(int, i, get_random_u32_below(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head2->lock); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; error: spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); spin_unlock_bh(&head->lock); return -ENOMEM; } /* * Bind a port for a connect operation and hash it. */ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM; h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); init_hashinfo_lhash2(h); return 0; } EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; if (locksz != 0) { /* allocate 2 cache lines or at least one spinlock per cpu */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; for (i = 0; i < nblocks; i++) spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = nblocks - 1; return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) { struct inet_hashinfo *new_hashinfo; int i; new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); if (!new_hashinfo) goto err; new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), GFP_KERNEL_ACCOUNT); if (!new_hashinfo->ehash) goto free_hashinfo; new_hashinfo->ehash_mask = ehash_entries - 1; if (inet_ehash_locks_alloc(new_hashinfo)) goto free_ehash; for (i = 0; i < ehash_entries; i++) INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); new_hashinfo->pernet = true; return new_hashinfo; free_ehash: vfree(new_hashinfo->ehash); free_hashinfo: kfree(new_hashinfo); err: return NULL; } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { if (!hashinfo->pernet) return; inet_ehash_locks_free(hashinfo); vfree(hashinfo->ehash); kfree(hashinfo); } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
55 8 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * __FORCE_ORDER here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ set_debugreg(0UL, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); dr7 &= ~0x400; /* architecturally set bit */ if (dr7) set_debugreg(0, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif #endif /* _ASM_X86_DEBUGREG_H */
3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static inline bool wait_born(struct super_block *sb) { unsigned int flags; /* * Pairs with smp_store_release() in super_wake() and ensures * that we see SB_BORN or SB_DYING after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & (SB_BORN | SB_DYING); } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: This returns true if SB_BORN was set, false if SB_DYING was * set. The function acquires s_umount and returns with it held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); relock: __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) return false; /* Has called ->get_tree() successfully. */ if (sb->s_flags & SB_BORN) return true; super_unlock(sb, excl); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, wait_born(sb)); /* * Neither SB_BORN nor SB_DYING are ever unset so we never loop. * Just reacquire @sb->s_umount for the caller. */ goto relock; } /* wait and acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = container_of(shrink, struct super_block, s_shrink); /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = container_of(shrink, struct super_block, s_shrink); /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); free_prealloced_shrinker(&s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { unregister_shrinker(&s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference * @s: reference we are trying to make active * * Tries to acquire an active reference. grab_super() is used when we * had just found a superblock in super_blocks or fs_type->fs_supers * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or * dying when grab_super() had been called). Note that this is only * called for superblocks not in rundown mode (== ones still on ->fs_supers * of their type), so increment of ->s_count is OK here. */ static int grab_super(struct super_block *s) __releases(sb_lock) { bool born; s->s_count++; spin_unlock(&sb_lock); born = super_lock_excl(s); if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } super_unlock_excl(s); put_super(s); return 0; } static inline bool wait_dead(struct super_block *sb) { unsigned int flags; /* * Pairs with memory barrier in super_wake() and ensures * that we see SB_DEAD after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & SB_DEAD; } /** * grab_super_dead - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super_dead(struct super_block *sb) { sb->s_count++; if (grab_super(sb)) { put_super(sb); lockdep_assert_held(&sb->s_umount); return true; } wait_var_event(&sb->s_flags, wait_dead(sb)); lockdep_assert_not_held(&sb->s_umount); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); register_shrinker_prepared(&s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); register_shrinker_prepared(&s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { /* Pairs with memory marrier in super_wake(). */ if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { struct super_block *sb; if (!bdev) return NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdev == bdev) { if (!grab_super(sb)) return NULL; super_unlock_excl(sb); return sb; } } spin_unlock(&sb_lock); return NULL; } struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool born; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ born = super_lock(sb, excl); if (born && sb->s_root) return sb; super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { super_unlock_excl(sb); } } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock a super block that the callers holds a reference to. * * The caller needs to ensure that the super_block isn't being freed while * calling this function, e.g. by holding a lock over the call to this function * and the place that clears the pointer to the superblock used by this function * before freeing the superblock. */ static bool super_lock_shared_active(struct super_block *sb) { bool born = super_lock_shared(sb); if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock_shared(sb); return false; } return true; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb = bdev->bd_holder; /* bd_holder_lock ensures that the sb isn't freed */ lockdep_assert_held(&bdev->bd_holder_lock); if (!super_lock_shared_active(sb)) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb = bdev->bd_holder; lockdep_assert_held(&bdev->bd_holder_lock); if (!super_lock_shared_active(sb)) return; sync_filesystem(sb); super_unlock_shared(sb); } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct block_device *bdev; bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); } /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, sb); return -EACCES; } /* * Until SB_BORN flag is set, there can be no active superblock * references and thus no filesystem freezing. get_active_super() will * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. * * It is enough to check bdev was not frozen before we set s_bdev. */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); blkdev_put(bdev, sb); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); blkdev_put(bdev, sb); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root\n", fc->fs_type->name); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (sb->s_writers.freeze_holders & who) { deactivate_locked_super(sb); return -EBUSY; } WARN_ON(sb->s_writers.freeze_holders == 0); /* * Someone else already holds this type of freeze; share the * freeze and assign the active ref to the freeze. */ sb->s_writers.freeze_holders |= who; super_unlock_excl(sb); return 0; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (!(sb->s_flags & SB_BORN)) { super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (!(sb->s_writers.freeze_holders & who)) { super_unlock_excl(sb); return -EINVAL; } /* * Freeze is shared with someone else. Release our hold and * drop the active ref that freeze_super assigned to the * freezer. */ if (sb->s_writers.freeze_holders & ~who) { sb->s_writers.freeze_holders &= ~who; deactivate_locked_super(sb); return 0; } } else { super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return error; } } sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); return 0; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) WARN(1, "Dying superblock while thawing!"); return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #include <net/sock.h> #include "devl_internal.h" struct devlink_info_req { struct sk_buff *msg; void (*version_cb)(const char *version_name, enum devlink_info_version_type version_type, void *version_cb_priv); void *version_cb_priv; }; struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; }; static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { { /* can't reinitialize driver with no down time */ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, }, }; static bool devlink_reload_combination_is_invalid(enum devlink_reload_action action, enum devlink_reload_limit limit) { int i; for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) if (devlink_reload_invalid_combinations[i].action == action && devlink_reload_invalid_combinations[i].limit == limit) return true; return false; } static bool devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) { return test_bit(action, &devlink->ops->reload_actions); } static bool devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) { return test_bit(limit, &devlink->ops->reload_limits); } static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_limit limit, u32 value) { struct nlattr *reload_stats_entry; reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); if (!reload_stats_entry) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) goto nla_put_failure; nla_nest_end(msg, reload_stats_entry); return 0; nla_put_failure: nla_nest_cancel(msg, reload_stats_entry); return -EMSGSIZE; } static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { struct nlattr *reload_stats_attr, *act_info, *act_stats; int i, j, stat_idx; u32 value; if (!is_remote) reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); else reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); if (!reload_stats_attr) return -EMSGSIZE; for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || i == DEVLINK_RELOAD_ACTION_UNSPEC) continue; act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); if (!act_info) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) goto action_info_nest_cancel; act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); if (!act_stats) goto action_info_nest_cancel; for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { /* Remote stats are shown even if not locally supported. * Stats of actions with unspecified limit are shown * though drivers don't need to register unspecified * limit. */ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && !devlink_reload_limit_is_supported(devlink, j)) || devlink_reload_combination_is_invalid(i, j)) continue; stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; if (!is_remote) value = devlink->stats.reload_stats[stat_idx]; else value = devlink->stats.remote_reload_stats[stat_idx]; if (devlink_reload_stat_put(msg, j, value)) goto action_stats_nest_cancel; } nla_nest_end(msg, act_stats); nla_nest_end(msg, act_info); } nla_nest_end(msg, reload_stats_attr); return 0; action_stats_nest_cancel: nla_nest_cancel(msg, act_stats); action_info_nest_cancel: nla_nest_cancel(msg, act_info); nla_put_failure: nla_nest_cancel(msg, reload_stats_attr); return -EMSGSIZE; } static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct nlattr *dev_stats; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) goto nla_put_failure; dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); if (!dev_stats) goto nla_put_failure; if (devlink_reload_stats_put(msg, devlink, false)) goto dev_stats_nest_cancel; if (devlink_reload_stats_put(msg, devlink, true)) goto dev_stats_nest_cancel; nla_nest_end(msg, dev_stats); genlmsg_end(msg, hdr); return 0; dev_stats_nest_cancel: nla_nest_cancel(msg, dev_stats); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); } int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one); } void devlink_notify_register(struct devlink *devlink) { devlink_notify(devlink, DEVLINK_CMD_NEW); devlink_linecards_notify_register(devlink); devlink_ports_notify_register(devlink); devlink_trap_policers_notify_register(devlink); devlink_trap_groups_notify_register(devlink); devlink_traps_notify_register(devlink); devlink_rates_notify_register(devlink); devlink_regions_notify_register(devlink); devlink_params_notify_register(devlink); } void devlink_notify_unregister(struct devlink *devlink) { devlink_params_notify_unregister(devlink); devlink_regions_notify_unregister(devlink); devlink_rates_notify_unregister(devlink); devlink_traps_notify_unregister(devlink); devlink_trap_groups_notify_unregister(devlink); devlink_trap_policers_notify_unregister(devlink); devlink_ports_notify_unregister(devlink); devlink_linecards_notify_unregister(devlink); devlink_notify(devlink, DEVLINK_CMD_DEL); } static void devlink_reload_failed_set(struct devlink *devlink, bool reload_failed) { if (devlink->reload_failed == reload_failed) return; devlink->reload_failed = reload_failed; devlink_notify(devlink, DEVLINK_CMD_NEW); } bool devlink_is_reload_failed(const struct devlink *devlink) { return devlink->reload_failed; } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); static void __devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, enum devlink_reload_limit limit, u32 actions_performed) { unsigned long actions = actions_performed; int stat_idx; int action; for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; reload_stats[stat_idx]++; } devlink_notify(devlink, DEVLINK_CMD_NEW); } static void devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, u32 actions_performed) { __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, actions_performed); } /** * devlink_remote_reload_actions_performed - Update devlink on reload actions * performed which are not a direct result of devlink reload call. * * This should be called by a driver after performing reload actions in case it was not * a result of devlink reload call. For example fw_activate was performed as a result * of devlink reload triggered fw_activate on another host. * The motivation for this function is to keep data on reload actions performed on this * function whether it was done due to direct devlink reload call or not. * * @devlink: devlink * @limit: reload limit * @actions_performed: bitmask of actions performed */ void devlink_remote_reload_actions_performed(struct devlink *devlink, enum devlink_reload_limit limit, u32 actions_performed) { if (WARN_ON(!actions_performed || actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || limit > DEVLINK_RELOAD_LIMIT_MAX)) return; __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, actions_performed); } EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); static struct net *devlink_netns_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; struct net *net; if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); return ERR_PTR(-EINVAL); } if (netns_pid_attr) { net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); } else if (netns_fd_attr) { net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); } else if (netns_id_attr) { net = get_net_ns_by_id(sock_net(skb->sk), nla_get_u32(netns_id_attr)); if (!net) net = ERR_PTR(-EINVAL); } else { WARN_ON(1); net = ERR_PTR(-EINVAL); } if (IS_ERR(net)) { NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); return ERR_PTR(-EINVAL); } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EPERM); } return net; } static void devlink_reload_netns_change(struct devlink *devlink, struct net *curr_net, struct net *dest_net) { /* Userspace needs to be notified about devlink objects * removed from original and entering new network namespace. * The rest of the devlink objects are re-created during * reload process so the notifications are generated separatelly. */ devlink_notify_unregister(devlink); write_pnet(&devlink->_net, dest_net); devlink_notify_register(devlink); } int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; struct net *curr_net; int err; memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; curr_net = devlink_net(devlink); if (dest_net && !net_eq(dest_net, curr_net)) devlink_reload_netns_change(devlink, curr_net, dest_net); if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) devlink_params_driverinit_load_new(devlink); err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; WARN_ON(!(*actions_performed & BIT(action))); /* Catch driver on updating the remote action within devlink reload */ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats))); devlink_reload_stats_update(devlink, limit, *actions_performed); return 0; } static int devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, enum devlink_command cmd, struct genl_info *info) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); if (!hdr) goto free_msg; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, actions_performed)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); nla_put_failure: genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; } int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; enum devlink_reload_action action; enum devlink_reload_limit limit; struct net *dest_net = NULL; u32 actions_performed; int err; err = devlink_resources_validate(devlink, NULL, info); if (err) { NL_SET_ERR_MSG(info->extack, "resources size validation failed"); return err; } if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); else action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; if (!devlink_reload_action_is_supported(devlink, action)) { NL_SET_ERR_MSG(info->extack, "Requested reload action is not supported by the driver"); return -EOPNOTSUPP; } limit = DEVLINK_RELOAD_LIMIT_UNSPEC; if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { struct nla_bitfield32 limits; u32 limits_selected; limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); limits_selected = limits.value & limits.selector; if (!limits_selected) { NL_SET_ERR_MSG(info->extack, "Invalid limit selected"); return -EINVAL; } for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) if (limits_selected & BIT(limit)) break; /* UAPI enables multiselection, but currently it is not used */ if (limits_selected != BIT(limit)) { NL_SET_ERR_MSG(info->extack, "Multiselection of limit is not supported"); return -EOPNOTSUPP; } if (!devlink_reload_limit_is_supported(devlink, limit)) { NL_SET_ERR_MSG(info->extack, "Requested limit is not supported by the driver"); return -EOPNOTSUPP; } if (devlink_reload_combination_is_invalid(action, limit)) { NL_SET_ERR_MSG(info->extack, "Requested limit is invalid for this action"); return -EINVAL; } } if (info->attrs[DEVLINK_ATTR_NETNS_PID] || info->attrs[DEVLINK_ATTR_NETNS_FD] || info->attrs[DEVLINK_ATTR_NETNS_ID]) { dest_net = devlink_netns_get(skb, info); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); if (!net_eq(dest_net, devlink_net(devlink)) && action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { NL_SET_ERR_MSG_MOD(info->extack, "Changing namespace is only supported for reinit action"); return -EOPNOTSUPP; } } err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); if (dest_net) put_net(dest_net); if (err) return err; /* For backward compatibility generate reply only if attributes used by user */ if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) return 0; return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, DEVLINK_CMD_RELOAD, info); } bool devlink_reload_actions_valid(const struct devlink_ops *ops) { const struct devlink_reload_combination *comb; int i; if (!devlink_reload_supported(ops)) { if (WARN_ON(ops->reload_actions)) return false; return true; } if (WARN_ON(!ops->reload_actions || ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) return false; if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) return false; for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { comb = &devlink_reload_invalid_combinations[i]; if (ops->reload_actions == BIT(comb->action) && ops->reload_limits == BIT(comb->limit)) return false; } return true; } static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; enum devlink_eswitch_encap_mode encap_mode; u8 inline_mode; void *hdr; int err = 0; u16 mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; err = devlink_nl_put_handle(msg, devlink); if (err) goto nla_put_failure; if (ops->eswitch_mode_get) { err = ops->eswitch_mode_get(devlink, &mode); if (err) goto nla_put_failure; err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); if (err) goto nla_put_failure; } if (ops->eswitch_inline_mode_get) { err = ops->eswitch_inline_mode_get(devlink, &inline_mode); if (err) goto nla_put_failure; err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, inline_mode); if (err) goto nla_put_failure; } if (ops->eswitch_encap_mode_get) { err = ops->eswitch_encap_mode_get(devlink, &encap_mode); if (err) goto nla_put_failure; err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); if (err) goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return err; } int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; enum devlink_eswitch_encap_mode encap_mode; u8 inline_mode; int err = 0; u16 mode; if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); err = devlink_rate_nodes_check(devlink, mode, info->extack); if (err) return err; err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; } if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { if (!ops->eswitch_inline_mode_set) return -EOPNOTSUPP; inline_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); err = ops->eswitch_inline_mode_set(devlink, inline_mode, info->extack); if (err) return err; } if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { if (!ops->eswitch_encap_mode_set) return -EOPNOTSUPP; encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); err = ops->eswitch_encap_mode_set(devlink, encap_mode, info->extack); if (err) return err; } return 0; } int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { if (!req->msg) return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn) { if (!req->msg) return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, bsn); } EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value, enum devlink_info_version_type version_type) { struct nlattr *nest; int err; if (req->version_cb) req->version_cb(version_name, version_type, req->version_cb_priv); if (!req->msg) return 0; nest = nla_nest_start_noflag(req->msg, attr); if (!nest) return -EMSGSIZE; err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, version_name); if (err) goto nla_put_failure; err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, version_value); if (err) goto nla_put_failure; nla_nest_end(req->msg, nest); return 0; nla_put_failure: nla_nest_cancel(req->msg, nest); return err; } int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, version_name, version_value, DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); int devlink_info_version_stored_put(struct devlink_info_req *req, const char *version_name, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, version_name, version_value, DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); int devlink_info_version_stored_put_ext(struct devlink_info_req *req, const char *version_name, const char *version_value, enum devlink_info_version_type version_type) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, version_name, version_value, version_type); } EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, version_name, version_value, DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_running_put); int devlink_info_version_running_put_ext(struct devlink_info_req *req, const char *version_name, const char *version_value, enum devlink_info_version_type version_type) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, version_name, version_value, version_type); } EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); static int devlink_nl_driver_info_get(struct device_driver *drv, struct devlink_info_req *req) { if (!drv) return 0; if (drv->name[0]) return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, drv->name); return 0; } static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct device *dev = devlink_to_dev(devlink); struct devlink_info_req req = {}; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; err = -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto err_cancel_msg; req.msg = msg; if (devlink->ops->info_get) { err = devlink->ops->info_get(devlink, &req, extack); if (err) goto err_cancel_msg; } err = devlink_nl_driver_info_get(dev->driver, &req); if (err) goto err_cancel_msg; genlmsg_end(msg, hdr); return 0; err_cancel_msg: genlmsg_cancel(msg, hdr); return err; } int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { int err; err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err == -EOPNOTSUPP) err = 0; return err; } int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one); } static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, struct devlink_flash_notify *params) { void *hdr; hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) goto out; if (params->status_msg && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, params->status_msg)) goto nla_put_failure; if (params->component && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, params->component)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, params->done, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, params->total, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, params->timeout, DEVLINK_ATTR_PAD)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void __devlink_flash_update_notify(struct devlink *devlink, enum devlink_command cmd, struct devlink_flash_notify *params) { struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && cmd != DEVLINK_CMD_FLASH_UPDATE_END && cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); if (err) goto out_free_msg; genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); return; out_free_msg: nlmsg_free(msg); } static void devlink_flash_update_begin_notify(struct devlink *devlink) { struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, &params); } static void devlink_flash_update_end_notify(struct devlink *devlink) { struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, &params); } void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, const char *component, unsigned long done, unsigned long total) { struct devlink_flash_notify params = { .status_msg = status_msg, .component = component, .done = done, .total = total, }; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, &params); } EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); void devlink_flash_update_timeout_notify(struct devlink *devlink, const char *status_msg, const char *component, unsigned long timeout) { struct devlink_flash_notify params = { .status_msg = status_msg, .component = component, .timeout = timeout, }; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, &params); } EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); struct devlink_flash_component_lookup_ctx { const char *lookup_name; bool lookup_name_found; }; static void devlink_flash_component_lookup_cb(const char *version_name, enum devlink_info_version_type version_type, void *version_cb_priv) { struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || lookup_ctx->lookup_name_found) return; lookup_ctx->lookup_name_found = !strcmp(lookup_ctx->lookup_name, version_name); } static int devlink_flash_component_get(struct devlink *devlink, struct nlattr *nla_component, const char **p_component, struct netlink_ext_ack *extack) { struct devlink_flash_component_lookup_ctx lookup_ctx = {}; struct devlink_info_req req = {}; const char *component; int ret; if (!nla_component) return 0; component = nla_data(nla_component); if (!devlink->ops->info_get) { NL_SET_ERR_MSG_ATTR(extack, nla_component, "component update is not supported by this device"); return -EOPNOTSUPP; } lookup_ctx.lookup_name = component; req.version_cb = devlink_flash_component_lookup_cb; req.version_cb_priv = &lookup_ctx; ret = devlink->ops->info_get(devlink, &req, NULL); if (ret) return ret; if (!lookup_ctx.lookup_name_found) { NL_SET_ERR_MSG_ATTR(extack, nla_component, "selected component is not supported by this device"); return -EINVAL; } *p_component = component; return 0; } int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { struct nlattr *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; const char *file_name; u32 supported_params; int ret; if (!devlink->ops->flash_update) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) return -EINVAL; ret = devlink_flash_component_get(devlink, info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], &params.component, info->extack); if (ret) return ret; supported_params = devlink->ops->supported_flash_update_params; nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; if (nla_overwrite_mask) { struct nla_bitfield32 sections; if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, "overwrite settings are not supported by this device"); return -EOPNOTSUPP; } sections = nla_get_bitfield32(nla_overwrite_mask); params.overwrite_mask = sections.value & sections.selector; } nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; file_name = nla_data(nla_file_name); ret = request_firmware(&params.fw, file_name, devlink->dev); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); return ret; } devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, &params, info->extack); devlink_flash_update_end_notify(devlink); release_firmware(params.fw); return ret; } static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { struct devlink_info_req req = {}; const struct nlattr *nlattr; struct sk_buff *msg; int rem, err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; req.msg = msg; err = devlink->ops->info_get(devlink, &req, NULL); if (err) goto free_msg; nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { const struct nlattr *kv; int rem_kv; if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) continue; nla_for_each_nested(kv, nlattr, rem_kv) { if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) continue; strlcat(buf, nla_data(kv), len); strlcat(buf, " ", len); } } free_msg: nlmsg_free(msg); } void devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { if (!devlink->ops->info_get) return; devl_lock(devlink); if (devl_is_registered(devlink)) __devlink_compat_running_version(devlink, buf, len); devl_unlock(devlink); } int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) { struct devlink_flash_update_params params = {}; int ret; devl_lock(devlink); if (!devl_is_registered(devlink)) { ret = -ENODEV; goto out_unlock; } if (!devlink->ops->flash_update) { ret = -EOPNOTSUPP; goto out_unlock; } ret = request_firmware(&params.fw, file_name, devlink->dev); if (ret) goto out_unlock; devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, &params, NULL); devlink_flash_update_end_notify(devlink); release_firmware(params.fw); out_unlock: devl_unlock(devlink); return ret; } static int devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct nlattr *selftests; void *hdr; int err; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, DEVLINK_CMD_SELFTESTS_GET); if (!hdr) return -EMSGSIZE; err = -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto err_cancel_msg; selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); if (!selftests) goto err_cancel_msg; for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { if (devlink->ops->selftest_check(devlink, i, extack)) { err = nla_put_flag(msg, i); if (err) goto err_cancel_msg; } } nla_nest_end(msg, selftests); genlmsg_end(msg, hdr); return 0; err_cancel_msg: genlmsg_cancel(msg, hdr); return err; } int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; if (!devlink->ops->selftest_check) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { if (!devlink->ops->selftest_check) return 0; return devlink_nl_selftests_fill(msg, devlink, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); } int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_selftests_get_dump_one); } static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, enum devlink_selftest_status test_status) { struct nlattr *result_attr; result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); if (!result_attr) return -EMSGSIZE; if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, test_status)) goto nla_put_failure; nla_nest_end(skb, result_attr); return 0; nla_put_failure: nla_nest_cancel(skb, result_attr); return -EMSGSIZE; } static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, }; int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; struct devlink *devlink = info->user_ptr[0]; struct nlattr *attrs, *selftests; struct sk_buff *msg; void *hdr; int err; int i; if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) return -EINVAL; attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, devlink_selftest_nl_policy, info->extack); if (err < 0) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = -EMSGSIZE; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); if (!hdr) goto free_msg; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); if (!selftests) goto genlmsg_cancel; for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { enum devlink_selftest_status test_status; if (nla_get_flag(tb[i])) { if (!devlink->ops->selftest_check(devlink, i, info->extack)) { if (devlink_selftest_result_put(msg, i, DEVLINK_SELFTEST_STATUS_SKIP)) goto selftests_nest_cancel; continue; } test_status = devlink->ops->selftest_run(devlink, i, info->extack); if (devlink_selftest_result_put(msg, i, test_status)) goto selftests_nest_cancel; } } nla_nest_end(msg, selftests); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); selftests_nest_cancel: nla_nest_cancel(msg, selftests); genlmsg_cancel: genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return err; }
55 55 1526 1526 428 431 428 428 610 610 428 428 411 411 411 411 9 3 411 411 411 3 32 32 18 18 12 12 2 390 390 385 385 7 3 3 3 3 3 1 1 1 1 1 1 1 151 1 1 106 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 610 402 378 346 55 55 1527 1525 1120 797 55 1 58 13 13 12 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "hard-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" #include "originator.h" #include "send.h" #include "soft-interface.h" #include "translation-table.h" /** * batadv_hardif_release() - release hard interface from lists and queue for * free after rcu grace period * @ref: kref pointer of the hard interface */ void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); kfree_rcu(hard_iface, rcu); } /** * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device * @net_dev: net_device to search for * * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors */ struct batadv_hard_iface * batadv_hardif_get_by_netdev(const struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } /** * batadv_getlink_net() - return link net namespace (of use fallback) * @netdev: net_device to check * @fallback_net: return in case get_link_net is not available for @netdev * * Return: result of rtnl_link_ops->get_link_net or @fallback_net */ static struct net *batadv_getlink_net(const struct net_device *netdev, struct net *fallback_net) { if (!netdev->rtnl_link_ops) return fallback_net; if (!netdev->rtnl_link_ops->get_link_net) return fallback_net; return netdev->rtnl_link_ops->get_link_net(netdev); } /** * batadv_mutual_parents() - check if two devices are each others parent * @dev1: 1st net dev * @net1: 1st devices netns * @dev2: 2nd net dev * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other! * * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, struct net *net1, const struct net_device *dev2, struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); const struct net *dev1_parent_net; const struct net *dev2_parent_net; dev1_parent_net = batadv_getlink_net(dev1, net1); dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && (dev2_parent_iflink == dev1->ifindex) && net_eq(dev1_parent_net, net2) && net_eq(dev2_parent_net, net1); } /** * batadv_is_on_batman_iface() - check if a device is a batman iface descendant * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it * is important to prevent this new interface from being used to create a new * mesh network (this behaviour would lead to a batman-over-batman * configuration). This function recursively checks all the fathers of the * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; iflink = dev_get_iflink(net_dev); if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); /* iflink to itself, most likely physical device */ if (net == parent_net && iflink == net_dev->ifindex) return false; /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); if (!parent_dev) { pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n", net_dev->name); return false; } if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); return ret; } static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) return false; if (net_dev->type != ARPHRD_ETHER) return false; if (net_dev->addr_len != ETH_ALEN) return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) return false; return true; } /** * batadv_get_real_netdevice() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev() * instead of this. * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; int iflink; ASSERT_RTNL(); if (!netdev) return NULL; iflink = dev_get_iflink(netdev); if (iflink == 0) { dev_hold(netdev); return netdev; } hard_iface = batadv_hardif_get_by_netdev(netdev); if (!hard_iface || !hard_iface->soft_iface) goto out; net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); /* iflink to itself, most likely physical device */ if (net == real_net && netdev->ifindex == iflink) { real_netdev = netdev; dev_hold(real_netdev); goto out; } real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); return real_netdev; } /** * batadv_get_real_netdev() - check if the given net_device struct is a virtual * interface on top of another 'real' interface * @net_device: the device to check * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ struct net_device *batadv_get_real_netdev(struct net_device *net_device) { struct net_device *real_netdev; rtnl_lock(); real_netdev = batadv_get_real_netdevice(net_device); rtnl_unlock(); return real_netdev; } /** * batadv_is_wext_netdev() - check if the given net_device struct is a * wext wifi interface * @net_device: the device to check * * Return: true if the net device is a wext wireless device, false * otherwise. */ static bool batadv_is_wext_netdev(struct net_device *net_device) { if (!net_device) return false; #ifdef CONFIG_WIRELESS_EXT /* pre-cfg80211 drivers have to implement WEXT, so it is possible to * check for wireless_handlers != NULL */ if (net_device->wireless_handlers) return true; #endif return false; } /** * batadv_is_cfg80211_netdev() - check if the given net_device struct is a * cfg80211 wifi interface * @net_device: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) { if (!net_device) return false; #if IS_ENABLED(CONFIG_CFG80211) /* cfg80211 drivers have to set ieee80211_ptr */ if (net_device->ieee80211_ptr) return true; #endif return false; } /** * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device * @net_device: the device to check * * Return: batadv_hard_iface_wifi_flags flags of the device */ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) { u32 wifi_flags = 0; struct net_device *real_netdev; if (batadv_is_wext_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT; if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; real_netdev = batadv_get_real_netdevice(net_device); if (!real_netdev) return wifi_flags; if (real_netdev == net_device) goto out; if (batadv_is_wext_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT; if (batadv_is_cfg80211_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; out: dev_put(real_netdev); return wifi_flags; } /** * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi * interface * @hard_iface: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface) { u32 allowed_flags = 0; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; return !!(hard_iface->wifi_flags & allowed_flags); } /** * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface * @hard_iface: the device to check * * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface) { if (!hard_iface) return false; return hard_iface->wifi_flags != 0; } /** * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary * @if_outgoing: the outgoing interface checked and considered for (re)broadcast * @orig_addr: the originator of this packet * @orig_neigh: originator address of the forwarder we just got the packet from * (NULL if we originated) * * Checks whether a packet needs to be (re)broadcasted on the given interface. * * Return: * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast */ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh) { struct batadv_hardif_neigh_node *hardif_neigh; struct hlist_node *first; int ret = BATADV_HARDIF_BCAST_OK; rcu_read_lock(); /* 0 neighbors -> no (re)broadcast */ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list)); if (!first) { ret = BATADV_HARDIF_BCAST_NORECIPIENT; goto out; } /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node, list); /* 1 neighbor, is the originator -> no rebroadcast */ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) { ret = BATADV_HARDIF_BCAST_DUPORIG; /* 1 neighbor, is the one we received from -> no rebroadcast */ } else if (orig_neigh && batadv_compare_eth(hardif_neigh->orig, orig_neigh)) { ret = BATADV_HARDIF_BCAST_DUPFWD; } out: rcu_read_unlock(); return ret; } static struct batadv_hard_iface * batadv_hardif_get_active(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *oldif) { struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, struct batadv_hard_iface *new_hard_iface) { struct batadv_hard_iface *curr_hard_iface; ASSERT_RTNL(); if (new_hard_iface) kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, new_hard_iface, 1); if (!new_hard_iface) goto out; bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: batadv_hardif_put(curr_hard_iface); } static bool batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) { if (hard_iface->net_dev->flags & IFF_UP) return true; return false; } static void batadv_check_known_mac_addr(const struct net_device *net_dev) { const struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) continue; if (!batadv_compare_eth(hard_iface->net_dev->dev_addr, net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", net_dev->dev_addr, hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } rcu_read_unlock(); } /** * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom * @soft_iface: netdev struct of the mesh interface */ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) { const struct batadv_hard_iface *hard_iface; unsigned short lower_header_len = ETH_HLEN; unsigned short lower_headroom = 0; unsigned short lower_tailroom = 0; unsigned short needed_headroom; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; if (hard_iface->soft_iface != soft_iface) continue; lower_header_len = max_t(unsigned short, lower_header_len, hard_iface->net_dev->hard_header_len); lower_headroom = max_t(unsigned short, lower_headroom, hard_iface->net_dev->needed_headroom); lower_tailroom = max_t(unsigned short, lower_tailroom, hard_iface->net_dev->needed_tailroom); } rcu_read_unlock(); needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); /* fragmentation headers don't strip the unicast/... header */ needed_headroom += sizeof(struct batadv_frag_packet); soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } /** * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface * @soft_iface: netdev struct of the soft interface * * Return: MTU for the soft-interface (limited by the minimal MTU of all active * slave interfaces) */ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) continue; min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu); } rcu_read_unlock(); if (atomic_read(&bat_priv->fragmentation) == 0) goto out; /* with fragmentation enabled the maximum size of internally generated * packets such as translation table exchanges or tvlv containers, etc * has to be calculated */ min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; out: /* report to the other components the maximum amount of bytes that * batman-adv can send over the wire (without considering the payload * overhead). For example, this value is used by TT to compute the * maximum local table size */ atomic_set(&bat_priv->packet_size_max, min_mtu); /* the real soft-interface MTU is computed by removing the payload * overhead from the maximum amount of bytes that was just computed. * * However batman-adv does not support MTUs bigger than ETH_DATA_LEN */ return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /** * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller * MTU appeared * @soft_iface: netdev struct of the soft interface */ void batadv_update_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int limit_mtu; int mtu; mtu = batadv_hardif_min_mtu(soft_iface); if (bat_priv->mtu_set_by_user) limit_mtu = bat_priv->mtu_set_by_user; else limit_mtu = ETH_DATA_LEN; mtu = min(mtu, limit_mtu); dev_set_mtu(soft_iface, mtu); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. */ batadv_tt_local_resize_to_mtu(soft_iface); } static void batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; /* the first active interface becomes our primary interface or * the next active interface after the old primary interface was removed */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) batadv_primary_if_select(bat_priv, hard_iface); batadv_info(hard_iface->soft_iface, "Interface activated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); if (bat_priv->algo_ops->iface.activate) bat_priv->algo_ops->iface.activate(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); } /** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net_device *soft_iface) { struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); unsigned int required_mtu; unsigned int hardif_mtu; int ret; hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu); required_mtu = READ_ONCE(soft_iface->mtu) + max_header_len; if (hardif_mtu < ETH_MIN_MTU + max_header_len) return -EINVAL; if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; kref_get(&hard_iface->refcount); dev_hold(soft_iface); hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; ret = bat_priv->algo_ops->iface.enable(hard_iface); if (ret < 0) goto err_upper; hard_iface->if_status = BATADV_IF_INACTIVE; kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv; hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; dev_add_pack(&hard_iface->batman_adv_ptype); batadv_info(hard_iface->soft_iface, "Adding interface: %s\n", hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); if (!atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); else batadv_err(hard_iface->soft_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); batadv_hardif_recalc_extra_skbroom(soft_iface); if (bat_priv->algo_ops->iface.enabled) bat_priv->algo_ops->iface.enabled(hard_iface); out: return 0; err_upper: netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface); err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); batadv_hardif_put(hard_iface); return ret; } /** * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces */ static size_t batadv_hardif_cnt(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; size_t count = 0; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; count++; } rcu_read_unlock(); return count; } /** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed */ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; new_if = batadv_hardif_get_active(hard_iface->soft_iface); batadv_primary_if_select(bat_priv, new_if); batadv_hardif_put(new_if); } bat_priv->algo_ops->iface.disable(hard_iface); hard_iface->if_status = BATADV_IF_NOT_IN_USE; /* delete all references to this hard_iface */ batadv_purge_orig_ref(bat_priv); batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) batadv_gw_check_client_stop(bat_priv); hard_iface->soft_iface = NULL; batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); } static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; ASSERT_RTNL(); if (!batadv_is_valid_iface(net_dev)) goto out; dev_hold(net_dev); hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); if (!hard_iface) goto release_dev; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); spin_lock_init(&hard_iface->neigh_list_lock); kref_init(&hard_iface->refcount); hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; atomic_set(&hard_iface->hop_penalty, 0); batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); batadv_hardif_generation++; return hard_iface; release_dev: dev_put(net_dev); out: return NULL; } static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) { ASSERT_RTNL(); /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) batadv_hardif_disable_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; batadv_hardif_put(hard_iface); } /** * batadv_hard_if_event_softif() - Handle events for soft interfaces * @event: NETDEV_* event to handle * @net_dev: net_device which generated an event * * Return: NOTIFY_* result */ static int batadv_hard_if_event_softif(unsigned long event, struct net_device *net_dev) { struct batadv_priv *bat_priv; switch (event) { case NETDEV_REGISTER: bat_priv = netdev_priv(net_dev); batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); break; } return NOTIFY_DONE; } static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; if (batadv_softif_is_valid(net_dev)) return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || event == NETDEV_POST_TYPE_CHANGE)) hard_iface = batadv_hardif_add_interface(net_dev); if (!hard_iface) goto out; switch (event) { case NETDEV_UP: batadv_hardif_activate_interface(hard_iface); break; case NETDEV_GOING_DOWN: case NETDEV_DOWN: batadv_hardif_deactivate_interface(hard_iface); break; case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; case NETDEV_CHANGEMTU: if (hard_iface->soft_iface) batadv_update_min_mtu(hard_iface->soft_iface); break; case NETDEV_CHANGEADDR: if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) goto hardif_put; batadv_check_known_mac_addr(hard_iface->net_dev); bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto hardif_put; if (hard_iface == primary_if) batadv_primary_if_update_addr(bat_priv, NULL); break; case NETDEV_CHANGEUPPER: hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; default: break; } hardif_put: batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); return NOTIFY_DONE; } struct notifier_block batadv_hard_if_notifier = { .notifier_call = batadv_hard_if_event, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. */ #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/idr.h> #include <linux/vmalloc.h> #include "xdp_umem.h" #include "xsk_queue.h" static DEFINE_IDA(umem_ida); static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kvfree(umem->pgs); umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { if (umem->user) { atomic_long_sub(umem->npgs, &umem->user->locked_vm); free_uid(umem->user); } } static void xdp_umem_addr_unmap(struct xdp_umem *umem) { vunmap(umem->addrs); umem->addrs = NULL; } static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages, u32 nr_pages) { umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!umem->addrs) return -ENOMEM; return 0; } static void xdp_umem_release(struct xdp_umem *umem) { umem->zc = false; ida_free(&umem_ida, umem->id); xdp_umem_addr_unmap(umem); xdp_umem_unpin_pages(umem); xdp_umem_unaccount_pages(umem); kfree(umem); } static void xdp_umem_release_deferred(struct work_struct *work) { struct xdp_umem *umem = container_of(work, struct xdp_umem, work); xdp_umem_release(umem); } void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) { if (!umem) return; if (refcount_dec_and_test(&umem->users)) { if (defer_cleanup) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } else { xdp_umem_release(umem); } } } static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; int err; umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { if (npgs >= 0) { umem->npgs = npgs; err = -ENOMEM; goto out_pin; } err = npgs; goto out_pgs; } return 0; out_pin: xdp_umem_unpin_pages(umem); out_pgs: kvfree(umem->pgs); umem->pgs = NULL; return err; } static int xdp_umem_account_pages(struct xdp_umem *umem) { unsigned long lock_limit, new_npgs, old_npgs; if (capable(CAP_IPC_LOCK)) return 0; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; umem->user = get_uid(current_user()); do { old_npgs = atomic_long_read(&umem->user->locked_vm); new_npgs = old_npgs + umem->npgs; if (new_npgs > lock_limit) { free_uid(umem->user); umem->user = NULL; return -ENOBUFS; } } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, new_npgs) != old_npgs); return 0; } static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; u64 addr = mr->addr, size = mr->len; u32 chunks_rem, npgs_rem; u64 chunks, npgs; int err; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or * - making sure the memory area is consecutive * but for now, we simply say "computer says no". */ return -EINVAL; } if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { /* Memory area has to be page size aligned. For * simplicity, this might change. */ return -EINVAL; } if ((addr + size) < addr) return -EINVAL; npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem); if (npgs_rem) npgs++; if (npgs > U32_MAX) return -EINVAL; chunks = div_u64_rem(size, chunk_size, &chunks_rem); if (!chunks || chunks > U32_MAX) return -EINVAL; if (!unaligned_chunks && chunks_rem) return -EINVAL; if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; umem->chunks = chunks; umem->npgs = npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) return err; err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs); if (err) goto out_unpin; return 0; out_unpin: xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; } struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) { struct xdp_umem *umem; int err; umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); err = ida_alloc(&umem_ida, GFP_KERNEL); if (err < 0) { kfree(umem); return ERR_PTR(err); } umem->id = err; err = xdp_umem_reg(umem, mr); if (err) { ida_free(&umem_ida, umem->id); kfree(umem); return ERR_PTR(err); } return umem; }
294 294 294 294 293 293 293 293 276 293 293 293 293 293 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include "internal.h" static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif #ifdef CONFIG_UTS_NS &utsns_operations, #endif #ifdef CONFIG_IPC_NS &ipcns_operations, #endif #ifdef CONFIG_PID_NS &pidns_operations, &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, #endif &mntns_operations, #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif #ifdef CONFIG_TIME_NS &timens_operations, &timens_for_children_operations, #endif }; static const char *proc_ns_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) return ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; error = ns_get_path(&ns_path, task, ns_ops); if (error) goto out; error = nd_jump_link(&ns_path); out: put_task_struct(task); return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) return res; if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); } put_task_struct(task); return res; } static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, .get_link = proc_ns_get_link, .setattr = proc_setattr, }; static struct dentry *proc_ns_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); const struct proc_ns_operations **entry, **last; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) goto out; entry = ns_entries + (ctx->pos - 2); last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; while (entry <= last) { const struct proc_ns_operations *ops = *entry; if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), proc_ns_instantiate, task, ops)) break; ctx->pos++; entry++; } out: put_task_struct(task); return 0; } const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_ns_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } if (entry == last) goto out; res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: return res; } const struct inode_operations proc_ns_dir_inode_operations = { .lookup = proc_ns_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, };
771 6773 1642 5806 604 54 7395 1789 1174 9 2460 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/thread_info.h> #include <linux/mm_types.h> #include <uapi/linux/uio.h> struct page; typedef unsigned int __bitwise iov_iter_extraction_t; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_IOVEC, ITER_KVEC, ITER_BVEC, ITER_XARRAY, ITER_DISCARD, ITER_UBUF, }; #define ITER_SOURCE 1 // == WRITE #define ITER_DEST 0 // == READ struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool copy_mc; bool nofault; bool data_source; bool user_backed; union { size_t iov_offset; int last_offset; }; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type * of iterator used. This means that you can use: * * &iter->__ubuf_iovec or iter->__iov * * interchangably for the user_backed cases, hence simplifying * some of the cases that need to deal with both. */ union { /* * This really should be a const, but we cannot do that without * also modifying any of the zero-filling iter init functions. * Leave it non-const for now, but it should be treated as such. */ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; loff_t xarray_start; }; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) #define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return i->user_backed; } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter_atomic(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); static inline void iov_iter_set_copy_mc(struct iov_iter *i) { i->copy_mc = true; } static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) { return i->copy_mc; } #else #define _copy_mc_to_iter _copy_to_iter static inline void iov_iter_set_copy_mc(struct iov_iter *i) { } static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) { return false; } #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct csum_state { __wsum csum; size_t off; }; size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); static __always_inline __must_check bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i); struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .copy_mc = false, .user_backed = true, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif
55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_COUNTER_H #define _LINUX_PAGE_COUNTER_H #include <linux/atomic.h> #include <linux/cache.h> #include <linux/kernel.h> #include <asm/page.h> struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field. The * memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; atomic_long_t min_usage; atomic_long_t children_min_usage; /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; atomic_long_t children_low_usage; unsigned long watermark; unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); unsigned long min; unsigned long low; unsigned long high; unsigned long max; struct page_counter *parent; } ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX #else #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) #endif static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { atomic_long_set(&counter->usage, 0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); static inline void page_counter_set_high(struct page_counter *counter, unsigned long nr_pages) { WRITE_ONCE(counter->high, nr_pages); } int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); static inline void page_counter_reset_watermark(struct page_counter *counter) { counter->watermark = page_counter_read(counter); } #endif /* _LINUX_PAGE_COUNTER_H */
7927 6401 4249 4249 1174 1174 16258 16257 142 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h> #if defined(arch_xchg) #define raw_xchg arch_xchg #elif defined(arch_xchg_relaxed) #define raw_xchg(...) \ __atomic_op_fence(arch_xchg, __VA_ARGS__) #else extern void raw_xchg_not_implemented(void); #define raw_xchg(...) raw_xchg_not_implemented() #endif #if defined(arch_xchg_acquire) #define raw_xchg_acquire arch_xchg_acquire #elif defined(arch_xchg_relaxed) #define raw_xchg_acquire(...) \ __atomic_op_acquire(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_acquire arch_xchg #else extern void raw_xchg_acquire_not_implemented(void); #define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented() #endif #if defined(arch_xchg_release) #define raw_xchg_release arch_xchg_release #elif defined(arch_xchg_relaxed) #define raw_xchg_release(...) \ __atomic_op_release(arch_xchg, __VA_ARGS__) #elif defined(arch_xchg) #define raw_xchg_release arch_xchg #else extern void raw_xchg_release_not_implemented(void); #define raw_xchg_release(...) raw_xchg_release_not_implemented() #endif #if defined(arch_xchg_relaxed) #define raw_xchg_relaxed arch_xchg_relaxed #elif defined(arch_xchg) #define raw_xchg_relaxed arch_xchg #else extern void raw_xchg_relaxed_not_implemented(void); #define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg) #define raw_cmpxchg arch_cmpxchg #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg(...) \ __atomic_op_fence(arch_cmpxchg, __VA_ARGS__) #else extern void raw_cmpxchg_not_implemented(void); #define raw_cmpxchg(...) raw_cmpxchg_not_implemented() #endif #if defined(arch_cmpxchg_acquire) #define raw_cmpxchg_acquire arch_cmpxchg_acquire #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_acquire arch_cmpxchg #else extern void raw_cmpxchg_acquire_not_implemented(void); #define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented() #endif #if defined(arch_cmpxchg_release) #define raw_cmpxchg_release arch_cmpxchg_release #elif defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_release(...) \ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_release arch_cmpxchg #else extern void raw_cmpxchg_release_not_implemented(void); #define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented() #endif #if defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_relaxed arch_cmpxchg_relaxed #elif defined(arch_cmpxchg) #define raw_cmpxchg_relaxed arch_cmpxchg #else extern void raw_cmpxchg_relaxed_not_implemented(void); #define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg64) #define raw_cmpxchg64 arch_cmpxchg64 #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #else extern void raw_cmpxchg64_not_implemented(void); #define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented() #endif #if defined(arch_cmpxchg64_acquire) #define raw_cmpxchg64_acquire arch_cmpxchg64_acquire #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_acquire arch_cmpxchg64 #else extern void raw_cmpxchg64_acquire_not_implemented(void); #define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented() #endif #if defined(arch_cmpxchg64_release) #define raw_cmpxchg64_release arch_cmpxchg64_release #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_release arch_cmpxchg64 #else extern void raw_cmpxchg64_release_not_implemented(void); #define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented() #endif #if defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_relaxed arch_cmpxchg64 #else extern void raw_cmpxchg64_relaxed_not_implemented(void); #define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented() #endif #if defined(arch_cmpxchg128) #define raw_cmpxchg128 arch_cmpxchg128 #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128(...) \ __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__) #else extern void raw_cmpxchg128_not_implemented(void); #define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented() #endif #if defined(arch_cmpxchg128_acquire) #define raw_cmpxchg128_acquire arch_cmpxchg128_acquire #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_acquire arch_cmpxchg128 #else extern void raw_cmpxchg128_acquire_not_implemented(void); #define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented() #endif #if defined(arch_cmpxchg128_release) #define raw_cmpxchg128_release arch_cmpxchg128_release #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_release(...) \ __atomic_op_release(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_release arch_cmpxchg128 #else extern void raw_cmpxchg128_release_not_implemented(void); #define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented() #endif #if defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_relaxed arch_cmpxchg128 #else extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented() #endif #if defined(arch_try_cmpxchg) #define raw_try_cmpxchg arch_try_cmpxchg #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg(...) \ __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) #else #define raw_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_acquire) #define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_acquire arch_try_cmpxchg #else #define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_release) #define raw_try_cmpxchg_release arch_try_cmpxchg_release #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_release(...) \ __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_release arch_try_cmpxchg #else #define raw_try_cmpxchg_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg #else #define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64 arch_try_cmpxchg64 #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64(...) \ __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) #else #define raw_try_cmpxchg64(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_acquire) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_release) #define raw_try_cmpxchg64_release arch_try_cmpxchg64_release #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_release(...) \ __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_release arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128 arch_try_cmpxchg128 #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128(...) \ __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__) #else #define raw_try_cmpxchg128(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_acquire) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_release) #define raw_try_cmpxchg128_release arch_try_cmpxchg128_release #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_release(...) \ __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_release arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg_local arch_cmpxchg_local #ifdef arch_try_cmpxchg_local #define raw_try_cmpxchg_local arch_try_cmpxchg_local #else #define raw_try_cmpxchg_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg64_local arch_cmpxchg64_local #ifdef arch_try_cmpxchg64_local #define raw_try_cmpxchg64_local arch_try_cmpxchg64_local #else #define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg128_local arch_cmpxchg128_local #ifdef arch_try_cmpxchg128_local #define raw_try_cmpxchg128_local arch_try_cmpxchg128_local #else #define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_sync_cmpxchg arch_sync_cmpxchg /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read(const atomic_t *v) { return arch_atomic_read(v); } /** * raw_atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); #else int ret; if (__native_word(atomic_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set(atomic_t *v, int i) { arch_atomic_set(v, i); } /** * raw_atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic_set(v, i); } #endif } /** * raw_atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_add(int i, atomic_t *v) { arch_atomic_add(i, v); } /** * raw_atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return(int i, atomic_t *v) { #if defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_add_return" #endif } /** * raw_atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_return_acquire) return arch_atomic_add_return_acquire(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_acquire" #endif } /** * raw_atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v) { #if defined(arch_atomic_add_return_release) return arch_atomic_add_return_release(i, v); #elif defined(arch_atomic_add_return_relaxed) __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_release" #endif } /** * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_return_relaxed) return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_relaxed" #endif } /** * raw_atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_add" #endif } /** * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_acquire) return arch_atomic_fetch_add_acquire(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_acquire" #endif } /** * raw_atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_release) return arch_atomic_fetch_add_release(i, v); #elif defined(arch_atomic_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_release" #endif } /** * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_relaxed) return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_relaxed" #endif } /** * raw_atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_sub(int i, atomic_t *v) { arch_atomic_sub(i, v); } /** * raw_atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return(int i, atomic_t *v) { #if defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_sub_return" #endif } /** * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_acquire) return arch_atomic_sub_return_acquire(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_acquire" #endif } /** * raw_atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_release) return arch_atomic_sub_return_release(i, v); #elif defined(arch_atomic_sub_return_relaxed) __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_release" #endif } /** * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_relaxed) return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_relaxed" #endif } /** * raw_atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_sub" #endif } /** * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_acquire) return arch_atomic_fetch_sub_acquire(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_acquire" #endif } /** * raw_atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_release) return arch_atomic_fetch_sub_release(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_release" #endif } /** * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_relaxed) return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_relaxed" #endif } /** * raw_atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_inc(atomic_t *v) { #if defined(arch_atomic_inc) arch_atomic_inc(v); #else raw_atomic_add(1, v); #endif } /** * raw_atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return(atomic_t *v) { #if defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #elif defined(arch_atomic_inc_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(1, v); #endif } /** * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v) { #if defined(arch_atomic_inc_return_acquire) return arch_atomic_inc_return_acquire(v); #elif defined(arch_atomic_inc_return_relaxed) int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_acquire(1, v); #endif } /** * raw_atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_release(atomic_t *v) { #if defined(arch_atomic_inc_return_release) return arch_atomic_inc_return_release(v); #elif defined(arch_atomic_inc_return_relaxed) __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_release(1, v); #endif } /** * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v) { #if defined(arch_atomic_inc_return_relaxed) return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc(atomic_t *v) { #if defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_add(1, v); #endif } /** * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_inc_acquire) return arch_atomic_fetch_inc_acquire(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_acquire(1, v); #endif } /** * raw_atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v) { #if defined(arch_atomic_fetch_inc_release) return arch_atomic_fetch_inc_release(v); #elif defined(arch_atomic_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_release(1, v); #endif } /** * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_inc_relaxed) return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_relaxed(1, v); #endif } /** * raw_atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_dec(atomic_t *v) { #if defined(arch_atomic_dec) arch_atomic_dec(v); #else raw_atomic_sub(1, v); #endif } /** * raw_atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return(atomic_t *v) { #if defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #elif defined(arch_atomic_dec_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_sub_return(1, v); #endif } /** * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v) { #if defined(arch_atomic_dec_return_acquire) return arch_atomic_dec_return_acquire(v); #elif defined(arch_atomic_dec_return_relaxed) int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_acquire(1, v); #endif } /** * raw_atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_release(atomic_t *v) { #if defined(arch_atomic_dec_return_release) return arch_atomic_dec_return_release(v); #elif defined(arch_atomic_dec_return_relaxed) __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_release(1, v); #endif } /** * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v) { #if defined(arch_atomic_dec_return_relaxed) return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec(atomic_t *v) { #if defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_sub(1, v); #endif } /** * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_dec_acquire) return arch_atomic_fetch_dec_acquire(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_acquire(1, v); #endif } /** * raw_atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v) { #if defined(arch_atomic_fetch_dec_release) return arch_atomic_fetch_dec_release(v); #elif defined(arch_atomic_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_release(1, v); #endif } /** * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_dec_relaxed) return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_and(int i, atomic_t *v) { arch_atomic_and(i, v); } /** * raw_atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_and" #endif } /** * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_acquire) return arch_atomic_fetch_and_acquire(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_acquire" #endif } /** * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_release) return arch_atomic_fetch_and_release(i, v); #elif defined(arch_atomic_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_release" #endif } /** * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_relaxed) return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_relaxed" #endif } /** * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_andnot(int i, atomic_t *v) { #if defined(arch_atomic_andnot) arch_atomic_andnot(i, v); #else raw_atomic_and(~i, v); #endif } /** * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_and(~i, v); #endif } /** * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_acquire) return arch_atomic_fetch_andnot_acquire(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_acquire(~i, v); #endif } /** * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_release) return arch_atomic_fetch_andnot_release(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_release(~i, v); #endif } /** * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_relaxed) return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_or(int i, atomic_t *v) { arch_atomic_or(i, v); } /** * raw_atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_or" #endif } /** * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_acquire) return arch_atomic_fetch_or_acquire(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_acquire" #endif } /** * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_release) return arch_atomic_fetch_or_release(i, v); #elif defined(arch_atomic_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_release" #endif } /** * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_relaxed) return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_relaxed" #endif } /** * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_xor(int i, atomic_t *v) { arch_atomic_xor(i, v); } /** * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_xor" #endif } /** * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_acquire) return arch_atomic_fetch_xor_acquire(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_acquire" #endif } /** * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_release) return arch_atomic_fetch_xor_release(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_release" #endif } /** * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_relaxed) return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_relaxed" #endif } /** * raw_atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg(atomic_t *v, int new) { #if defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_acquire(atomic_t *v, int new) { #if defined(arch_atomic_xchg_acquire) return arch_atomic_xchg_acquire(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret = arch_atomic_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_release(atomic_t *v, int new) { #if defined(arch_atomic_xchg_release) return arch_atomic_xchg_release(v, new); #elif defined(arch_atomic_xchg_relaxed) __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_relaxed(atomic_t *v, int new) { #if defined(arch_atomic_xchg_relaxed) return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_acquire) return arch_atomic_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_release) return arch_atomic_cmpxchg_release(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_relaxed) return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else int r, o = *old; r = raw_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_acquire) return arch_atomic_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_release) return arch_atomic_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_relaxed) return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_sub_and_test(int i, atomic_t *v) { #if defined(arch_atomic_sub_and_test) return arch_atomic_sub_and_test(i, v); #else return raw_atomic_sub_return(i, v) == 0; #endif } /** * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_dec_and_test(atomic_t *v) { #if defined(arch_atomic_dec_and_test) return arch_atomic_dec_and_test(v); #else return raw_atomic_dec_return(v) == 0; #endif } /** * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_inc_and_test(atomic_t *v) { #if defined(arch_atomic_inc_and_test) return arch_atomic_inc_and_test(v); #else return raw_atomic_inc_return(v) == 0; #endif } /** * raw_atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative(int i, atomic_t *v) { #if defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(i, v) < 0; #endif } /** * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_acquire) return arch_atomic_add_negative_acquire(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret = arch_atomic_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_release(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_release) return arch_atomic_add_negative_release(i, v); #elif defined(arch_atomic_add_negative_relaxed) __atomic_release_fence(); return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_release(i, v) < 0; #endif } /** * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_relaxed) return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_fetch_add_unless) return arch_atomic_fetch_add_unless(v, a, u); #else int c = raw_atomic_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_add_unless) return arch_atomic_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_not_zero(atomic_t *v) { #if defined(arch_atomic_inc_not_zero) return arch_atomic_inc_not_zero(v); #else return raw_atomic_add_unless(v, 1, 0); #endif } /** * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_unless_negative(atomic_t *v) { #if defined(arch_atomic_inc_unless_negative) return arch_atomic_inc_unless_negative(v); #else int c = raw_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_dec_unless_positive(atomic_t *v) { #if defined(arch_atomic_dec_unless_positive) return arch_atomic_dec_unless_positive(v); #else int c = raw_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int raw_atomic_dec_if_positive(atomic_t *v) { #if defined(arch_atomic_dec_if_positive) return arch_atomic_dec_if_positive(v); #else int dec, c = raw_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic_try_cmpxchg(v, &c, dec)); return dec; #endif } #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif /** * raw_atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read(const atomic64_t *v) { return arch_atomic64_read(v); } /** * raw_atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); #else s64 ret; if (__native_word(atomic64_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic64_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set(atomic64_t *v, s64 i) { arch_atomic64_set(v, i); } /** * raw_atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic64_set(v, i); } #endif } /** * raw_atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_add(s64 i, atomic64_t *v) { arch_atomic64_add(i, v); } /** * raw_atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_add_return" #endif } /** * raw_atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_acquire) return arch_atomic64_add_return_acquire(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_acquire" #endif } /** * raw_atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_release) return arch_atomic64_add_return_release(i, v); #elif defined(arch_atomic64_add_return_relaxed) __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_release" #endif } /** * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_relaxed) return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_relaxed" #endif } /** * raw_atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_add" #endif } /** * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_acquire) return arch_atomic64_fetch_add_acquire(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_acquire" #endif } /** * raw_atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_release) return arch_atomic64_fetch_add_release(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_release" #endif } /** * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_relaxed) return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_relaxed" #endif } /** * raw_atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v) { arch_atomic64_sub(i, v); } /** * raw_atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_sub_return" #endif } /** * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_acquire) return arch_atomic64_sub_return_acquire(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_acquire" #endif } /** * raw_atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_release) return arch_atomic64_sub_return_release(i, v); #elif defined(arch_atomic64_sub_return_relaxed) __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_release" #endif } /** * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_relaxed) return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_relaxed" #endif } /** * raw_atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_sub" #endif } /** * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_acquire) return arch_atomic64_fetch_sub_acquire(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_acquire" #endif } /** * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_release) return arch_atomic64_fetch_sub_release(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_release" #endif } /** * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_relaxed) return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_relaxed" #endif } /** * raw_atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_inc(atomic64_t *v) { #if defined(arch_atomic64_inc) arch_atomic64_inc(v); #else raw_atomic64_add(1, v); #endif } /** * raw_atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v) { #if defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(1, v); #endif } /** * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_inc_return_acquire) return arch_atomic64_inc_return_acquire(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_acquire(1, v); #endif } /** * raw_atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v) { #if defined(arch_atomic64_inc_return_release) return arch_atomic64_inc_return_release(v); #elif defined(arch_atomic64_inc_return_relaxed) __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_release(1, v); #endif } /** * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_inc_return_relaxed) return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_add(1, v); #endif } /** * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_acquire) return arch_atomic64_fetch_inc_acquire(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_acquire(1, v); #endif } /** * raw_atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_release) return arch_atomic64_fetch_inc_release(v); #elif defined(arch_atomic64_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_release(1, v); #endif } /** * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_relaxed) return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_relaxed(1, v); #endif } /** * raw_atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_dec(atomic64_t *v) { #if defined(arch_atomic64_dec) arch_atomic64_dec(v); #else raw_atomic64_sub(1, v); #endif } /** * raw_atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v) { #if defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_sub_return(1, v); #endif } /** * raw_atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_dec_return_acquire) return arch_atomic64_dec_return_acquire(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_acquire(1, v); #endif } /** * raw_atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v) { #if defined(arch_atomic64_dec_return_release) return arch_atomic64_dec_return_release(v); #elif defined(arch_atomic64_dec_return_relaxed) __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_release(1, v); #endif } /** * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_dec_return_relaxed) return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_sub(1, v); #endif } /** * raw_atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_acquire) return arch_atomic64_fetch_dec_acquire(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_acquire(1, v); #endif } /** * raw_atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_release) return arch_atomic64_fetch_dec_release(v); #elif defined(arch_atomic64_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_release(1, v); #endif } /** * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_relaxed) return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v) { arch_atomic64_and(i, v); } /** * raw_atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_and" #endif } /** * raw_atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_acquire) return arch_atomic64_fetch_and_acquire(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_acquire" #endif } /** * raw_atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_release) return arch_atomic64_fetch_and_release(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_release" #endif } /** * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_relaxed) return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_relaxed" #endif } /** * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_andnot) arch_atomic64_andnot(i, v); #else raw_atomic64_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_acquire) return arch_atomic64_fetch_andnot_acquire(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_acquire(~i, v); #endif } /** * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_release) return arch_atomic64_fetch_andnot_release(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_release(~i, v); #endif } /** * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_relaxed) return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v) { arch_atomic64_or(i, v); } /** * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_or" #endif } /** * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_acquire) return arch_atomic64_fetch_or_acquire(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_acquire" #endif } /** * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_release) return arch_atomic64_fetch_or_release(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_release" #endif } /** * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_relaxed) return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_relaxed" #endif } /** * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v) { arch_atomic64_xor(i, v); } /** * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_xor" #endif } /** * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_acquire) return arch_atomic64_fetch_xor_acquire(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_acquire" #endif } /** * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_release) return arch_atomic64_fetch_xor_release(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_release" #endif } /** * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_relaxed) return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_relaxed" #endif } /** * raw_atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_acquire(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_acquire) return arch_atomic64_xchg_acquire(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret = arch_atomic64_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_release(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_release) return arch_atomic64_xchg_release(v, new); #elif defined(arch_atomic64_xchg_relaxed) __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_relaxed) return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_acquire) return arch_atomic64_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_release) return arch_atomic64_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_relaxed) return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else s64 r, o = *old; r = raw_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_acquire) return arch_atomic64_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_release) return arch_atomic64_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, updates @old to the current value of @v. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_relaxed) return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_sub_and_test(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_and_test) return arch_atomic64_sub_and_test(i, v); #else return raw_atomic64_sub_return(i, v) == 0; #endif } /** * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_dec_and_test(atomic64_t *v) { #if defined(arch_atomic64_dec_and_test) return arch_atomic64_dec_and_test(v); #else return raw_atomic64_dec_return(v) == 0; #endif } /** * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_inc_and_test(atomic64_t *v) { #if defined(arch_atomic64_inc_and_test) return arch_atomic64_inc_and_test(v); #else return raw_atomic64_inc_return(v) == 0; #endif } /** * raw_atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(i, v) < 0; #endif } /** * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_acquire) return arch_atomic64_add_negative_acquire(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_release) return arch_atomic64_add_negative_release(i, v); #elif defined(arch_atomic64_add_negative_relaxed) __atomic_release_fence(); return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_release(i, v) < 0; #endif } /** * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_relaxed) return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_fetch_add_unless) return arch_atomic64_fetch_add_unless(v, a, u); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_add_unless) return arch_atomic64_add_unless(v, a, u); #else return raw_atomic64_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_not_zero(atomic64_t *v) { #if defined(arch_atomic64_inc_not_zero) return arch_atomic64_inc_not_zero(v); #else return raw_atomic64_add_unless(v, 1, 0); #endif } /** * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_unless_negative(atomic64_t *v) { #if defined(arch_atomic64_inc_unless_negative) return arch_atomic64_inc_unless_negative(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_dec_unless_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_unless_positive) return arch_atomic64_dec_unless_positive(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 raw_atomic64_dec_if_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_if_positive) return arch_atomic64_dec_if_positive(v); #else s64 dec, c = raw_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, dec)); return dec; #endif } #endif /* _LINUX_ATOMIC_FALLBACK_H */ // 2fdd6702823fa842f9cea57a002e6e4476ae780c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaulate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif
121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 121 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk)); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: len = 0; if (audit_sig_sid) { err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(&current->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = kmem_cache_create("audit_buffer", sizeof(struct audit_buffer), 0, SLAB_PANIC, NULL); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { if (!ab) return; kfree_skb(ab->skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; struct timespec64 t; unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &t, &serial); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; unsigned len; int error; u32 sid; security_current_getsecid_subj(&sid); if (!sid) return 0; error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx); security_release_secctx(ctx, len); return 0; error_path: audit_panic("error in audit_log_task_context"); return error; } EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; struct nlmsghdr *nlh; if (!ab) return; if (audit_rate_check()) { skb = ab->skb; ab->skb = NULL; /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet and poke the kauditd thread */ skb_queue_tail(&audit_queue, skb); wake_up_interruptible(&kauditd_wait); } else audit_log_lost("rate limit exceeded"); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log);
29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peer.h" #include "device.h" #include "queueing.h" #include "timers.h" #include "peerlookup.h" #include "noise.h" #include <linux/kref.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/list.h> static struct kmem_cache *peer_cache; static atomic64_t peer_counter = ATOMIC64_INIT(0); struct wg_peer *wg_peer_create(struct wg_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) { struct wg_peer *peer; int ret = -ENOMEM; lockdep_assert_held(&wg->device_update_lock); if (wg->num_peers >= MAX_PEERS_PER_DEVICE) return ERR_PTR(ret); peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))) goto err; peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); wg_prev_queue_init(&peer->tx_queue); wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state); netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll); napi_enable(&peer->napi); list_add_tail(&peer->peer_list, &wg->peer_list); INIT_LIST_HEAD(&peer->allowedips_list); wg_pubkey_hashtable_add(wg->peer_hashtable, peer); ++wg->num_peers; pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; err: kmem_cache_free(peer_cache, peer); return ERR_PTR(ret); } struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer) { RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Taking peer reference without holding the RCU read lock"); if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount))) return NULL; return peer; } static void peer_make_dead(struct wg_peer *peer) { /* Remove from configuration-time lookup structures. */ list_del_init(&peer->peer_list); wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer, &peer->device->device_update_lock); wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer); /* Mark as dead, so that we don't allow jumping contexts after. */ WRITE_ONCE(peer->is_dead, true); /* The caller must now synchronize_net() for this to take effect. */ } static void peer_remove_after_dead(struct wg_peer *peer) { WARN_ON(!peer->is_dead); /* No more keypairs can be created for this peer, since is_dead protects * add_new_keypair, so we can now destroy existing ones. */ wg_noise_keypairs_clear(&peer->keypairs); /* Destroy all ongoing timers that were in-flight at the beginning of * this function. */ wg_timers_stop(peer); /* The transition between packet encryption/decryption queues isn't * guarded by is_dead, but each reference's life is strictly bounded by * two generations: once for parallel crypto and once for serial * ingestion, so we can simply flush twice, and be sure that we no * longer have references inside these queues. */ /* a) For encrypt/decrypt. */ flush_workqueue(peer->device->packet_crypt_wq); /* b.1) For send (but not receive, since that's napi). */ flush_workqueue(peer->device->packet_crypt_wq); /* b.2.1) For receive (but not send, since that's wq). */ napi_disable(&peer->napi); /* b.2.1) It's now safe to remove the napi struct, which must be done * here from process context. */ netif_napi_del(&peer->napi); /* Ensure any workstructs we own (like transmit_handshake_work or * clear_peer_work) no longer are in use. */ flush_workqueue(peer->device->handshake_send_wq); /* After the above flushes, a peer might still be active in a few * different contexts: 1) from xmit(), before hitting is_dead and * returning, 2) from wg_packet_consume_data(), before hitting is_dead * and returning, 3) from wg_receive_handshake_packet() after a point * where it has processed an incoming handshake packet, but where * all calls to pass it off to timers fails because of is_dead. We won't * have new references in (1) eventually, because we're removed from * allowedips; we won't have new references in (2) eventually, because * wg_index_hashtable_lookup will always return NULL, since we removed * all existing keypairs and no more can be created; we won't have new * references in (3) eventually, because we're removed from the pubkey * hash table, which allows for a maximum of one handshake response, * via the still-uncleared index hashtable entry, but not more than one, * and in wg_cookie_message_consume, the lookup eventually gets a peer * with a refcount of zero, so no new reference is taken. */ --peer->device->num_peers; wg_peer_put(peer); } /* We have a separate "remove" function make sure that all active places where * a peer is currently operating will eventually come to an end and not pass * their reference onto another context. */ void wg_peer_remove(struct wg_peer *peer) { if (unlikely(!peer)) return; lockdep_assert_held(&peer->device->device_update_lock); peer_make_dead(peer); synchronize_net(); peer_remove_after_dead(peer); } void wg_peer_remove_all(struct wg_device *wg) { struct wg_peer *peer, *temp; LIST_HEAD(dead_peers); lockdep_assert_held(&wg->device_update_lock); /* Avoid having to traverse individually for each one. */ wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { peer_make_dead(peer); list_add_tail(&peer->peer_list, &dead_peers); } synchronize_net(); list_for_each_entry_safe(peer, temp, &dead_peers, peer_list) peer_remove_after_dead(peer); } static void rcu_release(struct rcu_head *rcu) { struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. */ memzero_explicit(peer, sizeof(*peer)); kmem_cache_free(peer_cache, peer); } static void kref_release(struct kref *refcount) { struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount); pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr); /* Remove ourself from dynamic runtime lookup structures, now that the * last reference is gone. */ wg_index_hashtable_remove(peer->device->index_hashtable, &peer->handshake.entry); /* Remove any lingering packets that didn't have a chance to be * transmitted. */ wg_packet_purge_staged_packets(peer); /* Free the memory used. */ call_rcu(&peer->rcu, rcu_release); } void wg_peer_put(struct wg_peer *peer) { if (unlikely(!peer)) return; kref_put(&peer->refcount, kref_release); } int __init wg_peer_init(void) { peer_cache = KMEM_CACHE(wg_peer, 0); return peer_cache ? 0 : -ENOMEM; } void wg_peer_uninit(void) { kmem_cache_destroy(peer_cache); }
7 7 7 7 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 16 16 16 16 16 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management * Copyright 2020 - 2022 Intel Corporation */ #include <linux/nl80211.h> #include <linux/export.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; int num = 0; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) num++; return num; } static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; int num = 0; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) num++; return num; } int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { return ieee80211_chanctx_num_assigned(local, ctx) + ieee80211_chanctx_num_reserved(local, ctx); } static int ieee80211_num_chanctx(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx; int num = 0; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) num++; return num; } static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local) { lockdep_assert_held(&local->chanctx_mtx); return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local); } static struct ieee80211_chanctx * ieee80211_link_get_chanctx(struct ieee80211_link_data *link) { struct ieee80211_local *local __maybe_unused = link->sdata->local; struct ieee80211_chanctx_conf *conf; conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) return NULL; return container_of(conf, struct ieee80211_chanctx, conf); } static const struct cfg80211_chan_def * ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { struct ieee80211_link_data *link; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (!compat) compat = &link->reserved_chandef; compat = cfg80211_chandef_compatible(&link->reserved_chandef, compat); if (!compat) break; } return compat; } static const struct cfg80211_chan_def * ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { struct ieee80211_link_data *link; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { struct ieee80211_bss_conf *link_conf = link->conf; if (link->reserved_chanctx) continue; if (!compat) compat = &link_conf->chandef; compat = cfg80211_chandef_compatible( &link_conf->chandef, compat); if (!compat) break; } return compat; } static const struct cfg80211_chan_def * ieee80211_chanctx_combined_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *compat) { lockdep_assert_held(&local->chanctx_mtx); compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); if (!compat) return NULL; compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat); if (!compat) return NULL; return compat; } static bool ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, const struct cfg80211_chan_def *def) { lockdep_assert_held(&local->chanctx_mtx); if (ieee80211_chanctx_combined_chandef(local, ctx, def)) return true; if (!list_empty(&ctx->reserved_links) && ieee80211_chanctx_reserved_chandef(local, ctx, def)) return true; return false; } static struct ieee80211_chanctx * ieee80211_find_reservation_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; if (!ieee80211_chanctx_can_reserve_chandef(local, ctx, chandef)) continue; return ctx; } return NULL; } static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta, unsigned int link_id) { enum ieee80211_sta_rx_bandwidth width; struct link_sta_info *link_sta; link_sta = rcu_dereference(sta->link[link_id]); /* no effect if this STA has no presence on this link */ if (!link_sta) return NL80211_CHAN_WIDTH_20_NOHT; width = ieee80211_sta_cap_rx_bw(link_sta); switch (width) { case IEEE80211_STA_RX_BW_20: if (link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20; else return NL80211_CHAN_WIDTH_20_NOHT; case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: /* * This applied for both 160 and 80+80. since we use * the returned value to consider degradation of * ctx->conf.min_def, we have to make sure to take * the bigger one (NL80211_CHAN_WIDTH_160). * Otherwise we might try degrading even when not * needed, as the max required sta_bw returned (80+80) * might be smaller than the configured bw (160). */ return NL80211_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_320: return NL80211_CHAN_WIDTH_320; default: WARN_ON(1); return NL80211_CHAN_WIDTH_20; } } static enum nl80211_chan_width ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata, unsigned int link_id) { enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct sta_info *sta; list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { if (sdata != sta->sdata && !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; max_bw = max(max_bw, ieee80211_get_sta_bw(sta, link_id)); } return max_bw; } static enum nl80211_chan_width ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_vif *vif = &sdata->vif; int link_id; rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (link != rsvd_for && rcu_access_pointer(link->conf->chanctx_conf) != &ctx->conf) continue; switch (vif->type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: width = ieee80211_get_max_required_bw(sdata, link_id); break; case NL80211_IFTYPE_STATION: /* * The ap's sta->bandwidth is not set yet at this * point, so take the width from the chandef, but * account also for TDLS peers */ width = max(link->conf->chandef.width, ieee80211_get_max_required_bw(sdata, link_id)); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: width = link->conf->chandef.width; break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: WARN_ON_ONCE(1); } max_bw = max(max_bw, width); } rcu_read_unlock(); return max_bw; } static enum nl80211_chan_width ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { struct ieee80211_sub_if_data *sdata; enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { enum nl80211_chan_width width; if (!ieee80211_sdata_running(sdata)) continue; width = ieee80211_get_chanctx_vif_max_required_bw(sdata, ctx, rsvd_for); max_bw = max(max_bw, width); } /* use the configured bandwidth in case of monitor interface */ sdata = rcu_dereference(local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &ctx->conf) max_bw = max(max_bw, ctx->conf.def.width); rcu_read_unlock(); return max_bw; } /* * recalc the min required chan width of the channel context, which is * the max of min required widths of all the interfaces bound to this * channel context. */ static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; lockdep_assert_held(&local->chanctx_mtx); /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return 0; } max_bw = ieee80211_get_chanctx_max_required_bw(local, ctx, rsvd_for); /* downgrade chandef up to max_bw */ min_def = ctx->conf.def; while (min_def.width > max_bw) ieee80211_chandef_downgrade(&min_def); if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def)) return 0; ctx->conf.min_def = min_def; if (!ctx->driver_present) return 0; return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; } /* calling this function is assuming that station vif is updated to * lates changes by calling ieee80211_link_update_chandef */ static void ieee80211_chan_bw_change(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, bool narrowed) { struct sta_info *sta; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[ctx->conf.def.chan->band]; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { struct ieee80211_sub_if_data *sdata = sta->sdata; enum ieee80211_sta_rx_bandwidth new_sta_bw; unsigned int link_id; if (!ieee80211_sdata_running(sta->sdata)) continue; for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) { struct ieee80211_bss_conf *link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); struct link_sta_info *link_sta; if (!link_conf) continue; if (rcu_access_pointer(link_conf->chanctx_conf) != &ctx->conf) continue; link_sta = rcu_dereference(sta->link[link_id]); if (!link_sta) continue; new_sta_bw = ieee80211_sta_cur_vht_bw(link_sta); /* nothing change */ if (new_sta_bw == link_sta->pub->bandwidth) continue; /* vif changed to narrow BW and narrow BW for station wasn't * requested or vise versa */ if ((new_sta_bw < link_sta->pub->bandwidth) == !narrowed) continue; link_sta->pub->bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, link_id, IEEE80211_RC_BW_CHANGED); } } rcu_read_unlock(); } /* * recalc the min required chan width of the channel context, which is * the max of min required widths of all the interfaces bound to this * channel context. */ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for) { u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); if (!changed) return; /* check is BW narrowed */ ieee80211_chan_bw_change(local, ctx, true); drv_change_chanctx(local, ctx, changed); /* check is BW wider */ ieee80211_chan_bw_change(local, ctx, false); } static void _ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, const struct cfg80211_chan_def *chandef, struct ieee80211_link_data *rsvd_for) { u32 changed; /* expected to handle only 20/40/80/160/320 channel widths */ switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_320: break; default: WARN_ON(1); } /* Check maybe BW narrowed - we do this _before_ calling recalc_chanctx_min_def * due to maybe not returning from it, e.g in case new context was added * first time with all parameters up to date. */ ieee80211_chan_bw_change(local, old_ctx, true); if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); return; } WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); ctx->conf.def = *chandef; /* check if min chanctx also changed */ changed = IEEE80211_CHANCTX_CHANGE_WIDTH | _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } /* check is BW wider */ ieee80211_chan_bw_change(local, old_ctx, false); } static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, const struct cfg80211_chan_def *chandef) { _ieee80211_change_chanctx(local, ctx, old_ctx, chandef, NULL); } static struct ieee80211_chanctx * ieee80211_find_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; list_for_each_entry(ctx, &local->chanctx_list, list) { const struct cfg80211_chan_def *compat; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) continue; if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef); if (!compat) continue; compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); if (!compat) continue; ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; } return NULL; } bool ieee80211_is_radar_required(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_held(&local->mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (link && link->radar_required) { rcu_read_unlock(); return true; } } } rcu_read_unlock(); return false; } static bool ieee80211_chanctx_radar_required(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; bool required = false; lockdep_assert_held(&local->chanctx_mtx); lockdep_assert_held(&local->mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { unsigned int link_id; if (!ieee80211_sdata_running(sdata)) continue; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (rcu_access_pointer(link->conf->chanctx_conf) != conf) continue; if (!link->radar_required) continue; required = true; break; } if (required) break; } rcu_read_unlock(); return required; } static struct ieee80211_chanctx * ieee80211_alloc_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; lockdep_assert_held(&local->chanctx_mtx); ctx = kzalloc(sizeof(*ctx) + local->hw.chanctx_data_size, GFP_KERNEL); if (!ctx) return NULL; INIT_LIST_HEAD(&ctx->assigned_links); INIT_LIST_HEAD(&ctx->reserved_links); ctx->conf.def = *chandef; ctx->conf.rx_chains_static = 1; ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; ctx->conf.radar_enabled = false; _ieee80211_recalc_chanctx_min_def(local, ctx, NULL); return ctx; } static int ieee80211_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { u32 changed; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); if (!local->use_chanctx) local->hw.conf.radar_enabled = ctx->conf.radar_enabled; /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) ieee80211_hw_config(local, changed); if (!local->use_chanctx) { local->_oper_chandef = ctx->conf.def; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { err = drv_add_chanctx(local, ctx); if (err) { ieee80211_recalc_idle(local); return err; } } return 0; } static struct ieee80211_chanctx * ieee80211_new_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); ctx = ieee80211_alloc_chanctx(local, chandef, mode); if (!ctx) return ERR_PTR(-ENOMEM); err = ieee80211_add_chanctx(local, ctx); if (err) { kfree(ctx); return ERR_PTR(err); } list_add_rcu(&ctx->list, &local->chanctx_list); return ctx; } static void ieee80211_del_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { lockdep_assert_held(&local->chanctx_mtx); if (!local->use_chanctx) { struct cfg80211_chan_def *chandef = &local->_oper_chandef; /* S1G doesn't have 20MHz, so get the correct width for the * current channel. */ if (chandef->chan->band == NL80211_BAND_S1GHZ) chandef->width = ieee80211_s1g_channel_width(chandef->chan); else chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for * single channel context. To be sure, check it ... */ WARN_ON(local->hw.conf.radar_enabled && !list_empty(&local->chanctx_list)); local->hw.conf.radar_enabled = false; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } else { drv_remove_chanctx(local, ctx); } ieee80211_recalc_idle(local); } static void ieee80211_free_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { lockdep_assert_held(&local->chanctx_mtx); WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0); list_del_rcu(&ctx->list); ieee80211_del_chanctx(local, ctx); kfree_rcu(ctx, rcu_head); } void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; const struct cfg80211_chan_def *compat = NULL; struct sta_info *sta; lockdep_assert_held(&local->chanctx_mtx); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int link_id; if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_bss_conf *link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (!link_conf) continue; if (rcu_access_pointer(link_conf->chanctx_conf) != conf) continue; if (!compat) compat = &link_conf->chandef; compat = cfg80211_chandef_compatible(&link_conf->chandef, compat); if (WARN_ON_ONCE(!compat)) break; } } if (WARN_ON_ONCE(!compat)) { rcu_read_unlock(); return; } /* TDLS peers can sometimes affect the chandef width */ list_for_each_entry_rcu(sta, &local->sta_list, list) { if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->tdls_chandef.chan) continue; compat = cfg80211_chandef_compatible(&sta->tdls_chandef, compat); if (WARN_ON_ONCE(!compat)) break; } rcu_read_unlock(); if (!compat) return; ieee80211_change_chanctx(local, ctx, ctx, compat); } static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { bool radar_enabled; lockdep_assert_held(&local->chanctx_mtx); /* for ieee80211_is_radar_required */ lockdep_assert_held(&local->mtx); radar_enabled = ieee80211_chanctx_radar_required(local, chanctx); if (radar_enabled == chanctx->conf.radar_enabled) return; chanctx->conf.radar_enabled = radar_enabled; if (!local->use_chanctx) { local->hw.conf.radar_enabled = chanctx->conf.radar_enabled; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); } static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, struct ieee80211_chanctx *new_ctx) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *curr_ctx = NULL; int ret = 0; if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) return -ENOTSUPP; conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (conf) { curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx); conf = NULL; list_del(&link->assigned_chanctx_list); } if (new_ctx) { /* recalc considering the link we'll use it for now */ ieee80211_recalc_chanctx_min_def(local, new_ctx, link); ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); if (ret) goto out; conf = &new_ctx->conf; list_add(&link->assigned_chanctx_list, &new_ctx->assigned_links); } out: rcu_assign_pointer(link->conf->chanctx_conf, conf); sdata->vif.cfg.idle = !conf; if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) { ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); ieee80211_recalc_radar_chanctx(local, curr_ctx); ieee80211_recalc_chanctx_min_def(local, curr_ctx, NULL); } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { ieee80211_recalc_txpower(sdata, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); } if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_MONITOR) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE); ieee80211_check_fast_xmit_iface(sdata); return ret; } void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { struct ieee80211_sub_if_data *sdata; u8 rx_chains_static, rx_chains_dynamic; lockdep_assert_held(&local->chanctx_mtx); rx_chains_static = 1; rx_chains_dynamic = 1; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { u8 needed_static, needed_dynamic; unsigned int link_id; if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!sdata->u.mgd.associated) continue; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: break; default: continue; } for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = rcu_dereference(sdata->link[link_id]); if (!link) continue; if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) continue; switch (link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", link->smps_mode); fallthrough; case IEEE80211_SMPS_OFF: needed_static = link->needed_rx_chains; needed_dynamic = link->needed_rx_chains; break; case IEEE80211_SMPS_DYNAMIC: needed_static = 1; needed_dynamic = link->needed_rx_chains; break; case IEEE80211_SMPS_STATIC: needed_static = 1; needed_dynamic = 1; break; } rx_chains_static = max(rx_chains_static, needed_static); rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); } } /* Disable SMPS for the monitor interface */ sdata = rcu_dereference(local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf) rx_chains_dynamic = rx_chains_static = local->rx_chains; rcu_read_unlock(); if (!local->use_chanctx) { if (rx_chains_static > 1) local->smps_mode = IEEE80211_SMPS_OFF; else if (rx_chains_dynamic > 1) local->smps_mode = IEEE80211_SMPS_DYNAMIC; else local->smps_mode = IEEE80211_SMPS_STATIC; ieee80211_hw_config(local, 0); } if (rx_chains_static == chanctx->conf.rx_chains_static && rx_chains_dynamic == chanctx->conf.rx_chains_dynamic) return; chanctx->conf.rx_chains_static = rx_chains_static; chanctx->conf.rx_chains_dynamic = rx_chains_dynamic; drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RX_CHAINS); } static void __ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_sub_if_data *vlan; struct ieee80211_chanctx_conf *conf; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) return; lockdep_assert_held(&local->mtx); /* Check that conf exists, even when clearing this function * must be called with the AP's channel context still there * as it would otherwise cause VLANs to have an invalid * channel context pointer for a while, possibly pointing * to a channel context that has already been freed. */ conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); WARN_ON(!conf); if (clear) conf = NULL; rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; rcu_assign_pointer(vlan_conf->chanctx_conf, conf); } rcu_read_unlock(); } void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, bool clear) { struct ieee80211_local *local = link->sdata->local; mutex_lock(&local->chanctx_mtx); __ieee80211_link_copy_chanctx_to_vlans(link, clear); mutex_unlock(&local->chanctx_mtx); } int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx *ctx = link->reserved_chanctx; lockdep_assert_held(&sdata->local->chanctx_mtx); if (WARN_ON(!ctx)) return -EINVAL; list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { if (WARN_ON(!ctx->replace_ctx)) return -EINVAL; WARN_ON(ctx->replace_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED); WARN_ON(ctx->replace_ctx->replace_ctx != ctx); ctx->replace_ctx->replace_ctx = NULL; ctx->replace_ctx->replace_state = IEEE80211_CHANCTX_REPLACE_NONE; list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } else { ieee80211_free_chanctx(sdata->local, ctx); } } return 0; } int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode, bool radar_required) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx; lockdep_assert_held(&local->chanctx_mtx); curr_ctx = ieee80211_link_get_chanctx(link); if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx) return -ENOTSUPP; new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); if (!new_ctx) { if (ieee80211_can_create_new_chanctx(local)) { new_ctx = ieee80211_new_chanctx(local, chandef, mode); if (IS_ERR(new_ctx)) return PTR_ERR(new_ctx); } else { if (!curr_ctx || (curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || !list_empty(&curr_ctx->reserved_links)) { /* * Another link already requested this context * for a reservation. Find another one hoping * all links assigned to it will also switch * soon enough. * * TODO: This needs a little more work as some * cases (more than 2 chanctx capable devices) * may fail which could otherwise succeed * provided some channel context juggling was * performed. * * Consider ctx1..3, link1..6, each ctx has 2 * links. link1 and link2 from ctx1 request new * different chandefs starting 2 in-place * reserations with ctx4 and ctx5 replacing * ctx1 and ctx2 respectively. Next link5 and * link6 from ctx3 reserve ctx4. If link3 and * link4 remain on ctx2 as they are then this * fails unless `replace_ctx` from ctx5 is * replaced with ctx3. */ list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) continue; if (!list_empty(&ctx->reserved_links)) continue; curr_ctx = ctx; break; } } /* * If that's true then all available contexts already * have reservations and cannot be used. */ if (!curr_ctx || (curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || !list_empty(&curr_ctx->reserved_links)) return -EBUSY; new_ctx = ieee80211_alloc_chanctx(local, chandef, mode); if (!new_ctx) return -ENOMEM; new_ctx->replace_ctx = curr_ctx; new_ctx->replace_state = IEEE80211_CHANCTX_REPLACES_OTHER; curr_ctx->replace_ctx = new_ctx; curr_ctx->replace_state = IEEE80211_CHANCTX_WILL_BE_REPLACED; list_add_rcu(&new_ctx->list, &local->chanctx_list); } } list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links); link->reserved_chanctx = new_ctx; link->reserved_chandef = *chandef; link->reserved_radar_required = radar_required; link->reserved_ready = false; return 0; } static void ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: ieee80211_queue_work(&sdata->local->hw, &link->csa_finalize_work); break; case NL80211_IFTYPE_STATION: wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->u.mgd.chswitch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: case NUM_NL80211_IFTYPES: WARN_ON(1); break; } } static void ieee80211_link_update_chandef(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_sub_if_data *vlan; link->conf->chandef = *chandef; if (sdata->vif.type != NL80211_IFTYPE_AP) return; rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; vlan_conf->chandef = *chandef; } rcu_read_unlock(); } static int ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_vif_chanctx_switch vif_chsw[1] = {}; struct ieee80211_chanctx *old_ctx, *new_ctx; const struct cfg80211_chan_def *chandef; u64 changed = 0; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (WARN_ON(!link->reserved_ready)) return -EBUSY; if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(!old_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, &link->reserved_chandef); if (WARN_ON(!chandef)) return -EINVAL; if (link_conf->chandef.width != link->reserved_chandef.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chandef(link, &link->reserved_chandef); _ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef, link); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = &new_ctx->conf; vif_chsw[0].link_conf = link->conf; list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = drv_switch_vif_chanctx(local, vif_chsw, 1, CHANCTX_SWMODE_REASSIGN_VIF); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) ieee80211_free_chanctx(local, new_ctx); goto out; } list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links); rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_link_copy_chanctx_to_vlans(link, false); ieee80211_check_fast_xmit_iface(sdata); if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); if (changed) ieee80211_link_info_change_notify(sdata, link, changed); out: ieee80211_link_chanctx_reservation_complete(link); return err; } static int ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *old_ctx, *new_ctx; const struct cfg80211_chan_def *chandef; int err; old_ctx = ieee80211_link_get_chanctx(link); new_ctx = link->reserved_chanctx; if (WARN_ON(!link->reserved_ready)) return -EINVAL; if (WARN_ON(old_ctx)) return -EINVAL; if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, &link->reserved_chandef); if (WARN_ON(!chandef)) return -EINVAL; ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef); list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = ieee80211_assign_link_chanctx(link, new_ctx); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) ieee80211_free_chanctx(local, new_ctx); goto out; } out: ieee80211_link_chanctx_reservation_complete(link); return err; } static bool ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx *old_ctx, *new_ctx; lockdep_assert_held(&sdata->local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (!old_ctx) return false; if (WARN_ON(!new_ctx)) return false; if (old_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) return false; if (new_ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) return false; return true; } static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local, struct ieee80211_chanctx *new_ctx) { const struct cfg80211_chan_def *chandef; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL); if (WARN_ON(!chandef)) return -EINVAL; local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled; local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); return 0; } static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, int n_vifs) { struct ieee80211_vif_chanctx_switch *vif_chsw; struct ieee80211_link_data *link; struct ieee80211_chanctx *ctx, *old_ctx; int i, err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); vif_chsw = kcalloc(n_vifs, sizeof(vif_chsw[0]), GFP_KERNEL); if (!vif_chsw) return -ENOMEM; i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto out; } list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (!ieee80211_link_has_in_place_reservation(link)) continue; old_ctx = ieee80211_link_get_chanctx(link); vif_chsw[i].vif = &link->sdata->vif; vif_chsw[i].old_ctx = &old_ctx->conf; vif_chsw[i].new_ctx = &ctx->conf; vif_chsw[i].link_conf = link->conf; i++; } } err = drv_switch_vif_chanctx(local, vif_chsw, n_vifs, CHANCTX_SWMODE_SWAP_CONTEXTS); out: kfree(vif_chsw); return err; } static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; ieee80211_del_chanctx(local, ctx->replace_ctx); err = ieee80211_add_chanctx(local, ctx); if (err) goto err; } return 0; err: WARN_ON(ieee80211_add_chanctx(local, ctx)); list_for_each_entry_continue_reverse(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; ieee80211_del_chanctx(local, ctx); WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx)); } return err; } static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; struct ieee80211_chanctx *new_ctx = NULL; int err, n_assigned, n_reserved, n_ready; int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); /* * If there are 2 independent pairs of channel contexts performing * cross-switch of their vifs this code will still wait until both are * ready even though it could be possible to switch one before the * other is ready. * * For practical reasons and code simplicity just do a single huge * switch. */ /* * Verify if the reservation is still feasible. * - if it's not then disconnect * - if it is but not all vifs necessary are ready then defer */ list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto err; } if (!local->use_chanctx) new_ctx = ctx; n_ctx++; n_assigned = 0; n_reserved = 0; n_ready = 0; list_for_each_entry(link, &ctx->replace_ctx->assigned_links, assigned_chanctx_list) { n_assigned++; if (link->reserved_chanctx) { n_reserved++; if (link->reserved_ready) n_ready++; } } if (n_assigned != n_reserved) { if (n_ready == n_reserved) { wiphy_info(local->hw.wiphy, "channel context reservation cannot be finalized because some interfaces aren't switching\n"); err = -EBUSY; goto err; } return -EAGAIN; } ctx->conf.radar_enabled = false; list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { if (ieee80211_link_has_in_place_reservation(link) && !link->reserved_ready) return -EAGAIN; old_ctx = ieee80211_link_get_chanctx(link); if (old_ctx) { if (old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) n_vifs_switch++; else n_vifs_assign++; } else { n_vifs_ctxless++; } if (link->reserved_radar_required) ctx->conf.radar_enabled = true; } } if (WARN_ON(n_ctx == 0) || WARN_ON(n_vifs_switch == 0 && n_vifs_assign == 0 && n_vifs_ctxless == 0) || WARN_ON(n_ctx > 1 && !local->use_chanctx) || WARN_ON(!new_ctx && !local->use_chanctx)) { err = -EINVAL; goto err; } /* * All necessary vifs are ready. Perform the switch now depending on * reservations and driver capabilities. */ if (local->use_chanctx) { if (n_vifs_switch > 0) { err = ieee80211_chsw_switch_vifs(local, n_vifs_switch); if (err) goto err; } if (n_vifs_assign > 0 || n_vifs_ctxless > 0) { err = ieee80211_chsw_switch_ctxs(local); if (err) goto err; } } else { err = ieee80211_chsw_switch_hwconf(local, new_ctx); if (err) goto err; } /* * Update all structures, values and pointers to point to new channel * context(s). */ list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link, *link_tmp; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (WARN_ON(!ctx->replace_ctx)) { err = -EINVAL; goto err; } list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; u64 changed = 0; if (!ieee80211_link_has_in_place_reservation(link)) continue; rcu_assign_pointer(link_conf->chanctx_conf, &ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_link_copy_chanctx_to_vlans(link, false); ieee80211_check_fast_xmit_iface(sdata); link->radar_required = link->reserved_radar_required; if (link_conf->chandef.width != link->reserved_chandef.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chandef(link, &link->reserved_chandef); if (changed) ieee80211_link_info_change_notify(sdata, link, changed); ieee80211_recalc_txpower(sdata, false); } ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx, NULL); list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { if (ieee80211_link_get_chanctx(link) != ctx) continue; list_del(&link->reserved_chanctx_list); list_move(&link->assigned_chanctx_list, &ctx->assigned_links); link->reserved_chanctx = NULL; ieee80211_link_chanctx_reservation_complete(link); } /* * This context might have been a dependency for an already * ready re-assign reservation interface that was deferred. Do * not propagate error to the caller though. The in-place * reservation for originally requested interface has already * succeeded at this point. */ list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) continue; if (WARN_ON(link->reserved_chanctx != ctx)) continue; if (!link->reserved_ready) continue; if (ieee80211_link_get_chanctx(link)) err = ieee80211_link_use_reserved_reassign(link); else err = ieee80211_link_use_reserved_assign(link); if (err) { link_info(link, "failed to finalize (re-)assign reservation (err=%d)\n", err); ieee80211_link_unreserve_chanctx(link); cfg80211_stop_iface(local->hw.wiphy, &link->sdata->wdev, GFP_KERNEL); } } } /* * Finally free old contexts */ list_for_each_entry_safe(ctx, ctx_tmp, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; ctx->replace_ctx->replace_ctx = NULL; ctx->replace_ctx->replace_state = IEEE80211_CHANCTX_REPLACE_NONE; list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } return 0; err: list_for_each_entry(ctx, &local->chanctx_list, list) { struct ieee80211_link_data *link, *link_tmp; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { ieee80211_link_unreserve_chanctx(link); ieee80211_link_chanctx_reservation_complete(link); } } return err; } static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; bool use_reserved_switch = false; lockdep_assert_held(&local->chanctx_mtx); conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) return; ctx = container_of(conf, struct ieee80211_chanctx, conf); if (link->reserved_chanctx) { if (link->reserved_chanctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && ieee80211_chanctx_num_reserved(local, link->reserved_chanctx) > 1) use_reserved_switch = true; ieee80211_link_unreserve_chanctx(link); } ieee80211_assign_link_chanctx(link, NULL); if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); link->radar_required = false; /* Unreserving may ready an in-place reservation. */ if (use_reserved_switch) ieee80211_vif_use_reserved_switch(local); } int ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode mode) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *ctx; u8 radar_detect_width = 0; int ret; lockdep_assert_held(&local->mtx); if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link->link_id))) { ieee80211_link_update_chandef(link, chandef); return 0; } mutex_lock(&local->chanctx_mtx); ret = cfg80211_chandef_dfs_required(local->hw.wiphy, chandef, sdata->wdev.iftype); if (ret < 0) goto out; if (ret > 0) radar_detect_width = BIT(chandef->width); link->radar_required = ret; ret = ieee80211_check_combinations(sdata, chandef, mode, radar_detect_width); if (ret < 0) goto out; __ieee80211_link_release_channel(link); ctx = ieee80211_find_chanctx(local, chandef, mode); if (!ctx) ctx = ieee80211_new_chanctx(local, chandef, mode); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto out; } ieee80211_link_update_chandef(link, chandef); ret = ieee80211_assign_link_chanctx(link, ctx); if (ret) { /* if assign fails refcount stays the same */ if (ieee80211_chanctx_refcount(local, ctx) == 0) ieee80211_free_chanctx(local, ctx); goto out; } ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); out: if (ret) link->radar_required = false; mutex_unlock(&local->chanctx_mtx); return ret; } int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *new_ctx; struct ieee80211_chanctx *old_ctx; int err; lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); new_ctx = link->reserved_chanctx; old_ctx = ieee80211_link_get_chanctx(link); if (WARN_ON(!new_ctx)) return -EINVAL; if (WARN_ON(new_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return -EINVAL; if (WARN_ON(link->reserved_ready)) return -EINVAL; link->reserved_ready = true; if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) { if (old_ctx) return ieee80211_link_use_reserved_reassign(link); return ieee80211_link_use_reserved_assign(link); } /* * In-place reservation may need to be finalized now either if: * a) sdata is taking part in the swapping itself and is the last one * b) sdata has switched with a re-assign reservation to an existing * context readying in-place switching of old_ctx * * In case of (b) do not propagate the error up because the requested * sdata already switched successfully. Just spill an extra warning. * The ieee80211_vif_use_reserved_switch() already stops all necessary * interfaces upon failure. */ if ((old_ctx && old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { err = ieee80211_vif_use_reserved_switch(local); if (err && err != -EAGAIN) { if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return err; wiphy_info(local->hw.wiphy, "depending in-place reservation failed (err=%d)\n", err); } } return 0; } int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; const struct cfg80211_chan_def *compat; int ret; if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, IEEE80211_CHAN_DISABLED)) return -EINVAL; mutex_lock(&local->chanctx_mtx); if (cfg80211_chandef_identical(chandef, &link_conf->chandef)) { ret = 0; goto out; } if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT || link_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) { ret = -EINVAL; goto out; } conf = rcu_dereference_protected(link_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) { ret = -EINVAL; goto out; } ctx = container_of(conf, struct ieee80211_chanctx, conf); compat = cfg80211_chandef_compatible(&conf->def, chandef); if (!compat) { ret = -EINVAL; goto out; } switch (ctx->replace_state) { case IEEE80211_CHANCTX_REPLACE_NONE: if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) { ret = -EBUSY; goto out; } break; case IEEE80211_CHANCTX_WILL_BE_REPLACED: /* TODO: Perhaps the bandwidth change could be treated as a * reservation itself? */ ret = -EBUSY; goto out; case IEEE80211_CHANCTX_REPLACES_OTHER: /* channel context that is going to replace another channel * context doesn't really exist and shouldn't be assigned * anywhere yet */ WARN_ON(1); break; } ieee80211_link_update_chandef(link, chandef); ieee80211_recalc_chanctx_chantype(local, ctx); *changed |= BSS_CHANGED_BANDWIDTH; ret = 0; out: mutex_unlock(&local->chanctx_mtx); return ret; } void ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; mutex_lock(&sdata->local->chanctx_mtx); if (rcu_access_pointer(link->conf->chanctx_conf)) { lockdep_assert_held(&sdata->local->mtx); __ieee80211_link_release_channel(link); } mutex_unlock(&sdata->local->chanctx_mtx); } void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_bss_conf *ap_conf; struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *ap; struct ieee80211_chanctx_conf *conf; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->bss)) return; ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); mutex_lock(&local->chanctx_mtx); rcu_read_lock(); ap_conf = rcu_dereference(ap->vif.link_conf[link_id]); conf = rcu_dereference_protected(ap_conf->chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); rcu_assign_pointer(link_conf->chanctx_conf, conf); rcu_read_unlock(); mutex_unlock(&local->chanctx_mtx); } void ieee80211_iter_chan_contexts_atomic( struct ieee80211_hw *hw, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_chanctx *ctx; rcu_read_lock(); list_for_each_entry_rcu(ctx, &local->chanctx_list, list) if (ctx->driver_present) iter(hw, &ctx->conf, iter_data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic);
30 9 9 9 7 9 7 9 6 2 9 9 9 131 131 25 5 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/slave.c - Slave device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "dsa.h" #include "port.h" #include "master.h" #include "netlink.h" #include "slave.h" #include "switch.h" #include "tag.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_slave_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_slave_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_slave_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_slave_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_slave_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_slave_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_slave_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(master, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering, &ctx); } static int dsa_slave_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(master, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering, &ctx); } static int dsa_slave_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(master, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering, &ctx); } static int dsa_slave_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(master, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering, &ctx); } void dsa_slave_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_slave_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_slave_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_slave_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_slave_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_slave_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_slave_mii_bus_init(struct dsa_switch *ds) { ds->slave_mii_bus->priv = (void *)ds; ds->slave_mii_bus->name = "dsa slave smi"; ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* slave device handling ****************************************************/ static int dsa_slave_get_iflink(const struct net_device *dev) { return dsa_slave_to_master(dev)->ifindex; } static int dsa_slave_open(struct net_device *dev) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int err; err = dev_open(master, NULL); if (err < 0) { netdev_err(dev, "failed to open master %s\n", master->name); goto out; } if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0); if (err) goto out; } if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) { err = dev_uc_add(master, dev->dev_addr); if (err < 0) goto del_host_addr; } err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto del_unicast; return 0; del_unicast: if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); out: return err; } static int dsa_slave_close(struct net_device *dev) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; dsa_port_disable_rt(dp); if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); return 0; } static void dsa_slave_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_slave_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_slave_manage_host_flood(dev); } static void dsa_slave_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_slave_sync_mc, dsa_slave_unsync_mc); __dev_uc_sync(dev, dsa_slave_sync_uc, dsa_slave_unsync_uc); } static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; /* If the port is down, the address isn't synced yet to hardware or * to the DSA master, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0); if (err) return err; } if (!ether_addr_equal(addr->sa_data, master->dev_addr)) { err = dev_uc_add(master, addr->sa_data); if (err < 0) goto del_unicast; } if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; del_unicast: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0); return err; } struct dsa_slave_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_slave_dump_ctx *dump = data; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < dump->cb->args[2]) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; int port = p->dp->index; /* Pass through to switch driver if it supports timestamping */ switch (cmd) { case SIOCGHWTSTAMP: if (ds->ops->port_hwtstamp_get) return ds->ops->port_hwtstamp_get(ds, port, ifr); break; case SIOCSHWTSTAMP: if (ds->ops->port_hwtstamp_set) return ds->ops->port_hwtstamp_set(ds, port, ifr); break; } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_slave_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_slave_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_slave_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_slave_vlan_add(dev, obj, extack); else err = dsa_slave_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_slave_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_slave_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_slave_vlan_del(dev, obj); else err = dsa_slave_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_slave_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_slave_to_master(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev) { int needed_headroom = dev->needed_headroom; int needed_tailroom = dev->needed_tailroom; /* For tail taggers, we need to pad short frames ourselves, to ensure * that the tail tag does not fail at its role of being at the end of * the packet, once the master interface pads the frame. Account for * that pad length here, and pad later. */ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) needed_tailroom += ETH_ZLEN - skb->len; /* skb_headroom() returns unsigned int... */ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) /* No reallocation needed, yay! */ return 0; return pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC); } static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (dsa_realloc_skb(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * dsa_realloc_skb(), which has also ensured that padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_slave_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_slave_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_slave_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_slave_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_slave_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_slave_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; strscpy_pad(data, "tx_packets", len); strscpy_pad(data + len, "tx_bytes", len); strscpy_pad(data + 2 * len, "rx_packets", len); strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_slave_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_slave_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_slave_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_slave_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_slave_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_slave_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_slave_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_slave_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_slave_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->get_mac_eee) return -EOPNOTSUPP; ret = ds->ops->get_mac_eee(ds, dp->index, e); if (ret) return ret; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_slave_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_slave_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_slave_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_slave_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_slave_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_slave_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_slave_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_slave_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_slave_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, master); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_slave_netpoll_cleanup(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_slave_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_slave_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (!ds->ops->port_mirror_add) return -EOPNOTSUPP; if (!flow_action_basic_hw_stats_check(&cls->rule->action, cls->common.extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (!dsa_slave_dev_check(act->dev)) return -EOPNOTSUPP; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; to_dp = dsa_slave_to_port(act->dev); mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_slave_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, cls->common.extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_slave_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { int err = -EOPNOTSUPP; if (cls->common.protocol == htons(ETH_P_ALL) && flow_offload_has_one_action(&cls->rule->action) && cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED) err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress); else if (flow_offload_has_one_action(&cls->rule->action) && cls->rule->action.entries[0].id == FLOW_ACTION_POLICE) err = dsa_slave_add_cls_matchall_police(dev, cls, ingress); return err; } static void dsa_slave_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_slave_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_slave_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_slave_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_slave_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_slave_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_slave_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_slave_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_slave_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_slave_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_slave_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_slave_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_slave_block_cb_list); static int dsa_slave_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_slave_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_slave_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_slave_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *master = dsa_port_to_master(dsa_to_port(ds, port)); if (!master->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT, type_data); } static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_slave_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_slave_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_slave_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_slave_get_ts_info(struct net_device *dev, struct ethtool_ts_info *ts) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_slave_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_slave_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_slave_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_slave_vlan_rx_add_vid(arg, proto, vid); } static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_slave_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_slave_manage_vlan_filtering(struct net_device *slave, bool vlan_filtering) { int err; if (vlan_filtering) { slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(slave, dsa_slave_restore_vlan, slave); if (err) { vlan_for_each(slave, dsa_slave_clear_vlan, slave); slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(slave, dsa_slave_clear_vlan, slave); if (err) return err; slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *slave; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; slave = other_dp->slave; if (min_mtu > slave->mtu) min_mtu = slave->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = slave; hw_port->old_mtu = slave->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_master_mtu; int old_master_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int slave_mtu; /* During probe, this function will be called for each slave * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->slave) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) slave_mtu = new_mtu; else slave_mtu = other_dp->slave->mtu; if (largest_mtu < slave_mtu) largest_mtu = slave_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, master->max_mtu, dev->max_mtu + overhead); old_master_mtu = master->mtu; new_master_mtu = largest_mtu + overhead; if (new_master_mtu > mtu_limit) return -ERANGE; /* If the master MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_master_mtu != old_master_mtu) { err = dev_set_mtu(master, new_master_mtu); if (err < 0) goto out_master_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; dev->mtu = new_mtu; dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_master_mtu != old_master_mtu) dsa_port_mtu_change(cpu_dp, old_master_mtu - overhead); out_cpu_failed: if (new_master_mtu != old_master_mtu) dev_set_mtu(master, old_master_mtu); out_master_failed: return err; } static int __maybe_unused dsa_slave_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_slave_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_slave_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_slave_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_slave_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_slave_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_slave_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_slave_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_slave_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_slave_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_slave_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, .nway_reset = dsa_slave_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, .set_eeprom = dsa_slave_set_eeprom, .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, .get_eth_phy_stats = dsa_slave_get_eth_phy_stats, .get_eth_mac_stats = dsa_slave_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_slave_get_eth_ctrl_stats, .get_rmon_stats = dsa_slave_get_rmon_stats, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, .get_pause_stats = dsa_slave_get_pause_stats, .get_pauseparam = dsa_slave_get_pauseparam, .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, .self_test = dsa_slave_net_selftest, .get_mm = dsa_slave_get_mm, .set_mm = dsa_slave_set_mm, .get_mm_stats = dsa_slave_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = { .ieee_setapp = dsa_slave_dcbnl_ieee_setapp, .ieee_delapp = dsa_slave_dcbnl_ieee_delapp, }; static void dsa_slave_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_slave_to_port(ctx->dev); struct net_device *master = dsa_port_to_master(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = master; return 0; } static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_eth_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_slave_netpoll_setup, .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup, .ndo_poll_controller = dsa_slave_poll_controller, #endif .ndo_setup_tc = dsa_slave_setup_tc, .ndo_get_stats64 = dsa_slave_get_stats64, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, .ndo_change_mtu = dsa_slave_change_mtu, .ndo_fill_forward_path = dsa_slave_fill_forward_path, }; static struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* slave device setup *******************************************************/ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_switch *ds = dp->ds; slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); if (!slave_dev->phydev) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; } slave_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, slave_dev->phydev); } static int dsa_slave_phy_setup(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->slave_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags); } if (ret) { netdev_err(slave_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_slave_setup_tagger(struct net_device *slave) { struct dsa_port *dp = dsa_slave_to_port(slave); struct net_device *master = dsa_port_to_master(dp); struct dsa_slave_priv *p = netdev_priv(slave); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; slave->needed_headroom = cpu_dp->tag_ops->needed_headroom; slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the master) * by also inheriting the master's needed headroom and tailroom. * The 8021q driver also does this. */ slave->needed_headroom += master->needed_headroom; slave->needed_tailroom += master->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; slave->features = master->vlan_features | NETIF_F_HW_TC; slave->hw_features |= NETIF_F_HW_TC; slave->features |= NETIF_F_LLTX; if (slave->needed_tailroom) slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); if (!netif_running(slave_dev)) return 0; netif_device_detach(slave_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_slave_resume(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); if (!netif_running(slave_dev)) return 0; netif_device_attach(slave_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_slave_create(struct dsa_port *port) { struct net_device *master = dsa_port_to_master(port); struct dsa_switch *ds = port->ds; struct net_device *slave_dev; struct dsa_slave_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (slave_dev == NULL) return -ENOMEM; slave_dev->rtnl_link_ops = &dsa_link_ops; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) slave_dev->dcbnl_ops = &dsa_slave_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(slave_dev, port->mac); else eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) slave_dev->priv_flags |= IFF_UNICAST_FLT; slave_dev->netdev_ops = &dsa_slave_netdev_ops; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); SET_NETDEV_DEV(slave_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(slave_dev, &port->devlink_port); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!slave_dev->tstats) { free_netdev(slave_dev); return -ENOMEM; } ret = gro_cells_init(&p->gcells, slave_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->slave = slave_dev; dsa_slave_setup_tagger(slave_dev); netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(slave_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_slave_dcbnl_init(slave_dev); if (ret) { netdev_err(slave_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(master, slave_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_percpu(slave_dev->tstats); free_netdev(slave_dev); port->slave = NULL; return ret; } void dsa_slave_destroy(struct net_device *slave_dev) { struct net_device *master = dsa_slave_to_master(slave_dev); struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); netif_carrier_off(slave_dev); rtnl_lock(); netdev_upper_dev_unlink(master, slave_dev); unregister_netdevice(slave_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_percpu(slave_dev->tstats); free_netdev(slave_dev); } int dsa_slave_change_master(struct net_device *dev, struct net_device *master, struct netlink_ext_ack *extack) { struct net_device *old_master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (master == old_master) return 0; if (!ds->ops->port_change_master) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA master"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(master)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA master"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(master, upper, iter) { if (dsa_slave_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join master with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA master, plus we auto-open the * DSA master when the user port opens => we need to ensure that the * new DSA master is open too. */ if (dev->flags & IFF_UP) { err = dev_open(master, extack); if (err) return err; } netdev_upper_dev_unlink(old_master, dev); err = netdev_upper_dev_link(master, dev, extack); if (err) goto out_revert_old_master_unlink; err = dsa_port_change_master(dp, master, extack); if (err) goto out_revert_master_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_slave_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new master: %pe\n", ERR_PTR(err)); } /* If the port doesn't have its own MAC address and relies on the DSA * master's one, inherit it again from the new DSA master. */ if (is_zero_ether_addr(dp->mac)) eth_hw_addr_inherit(dev, master); return 0; out_revert_master_link: netdev_upper_dev_unlink(master, dev); out_revert_old_master_unlink: netdev_upper_dev_link(old_master, dev, NULL); return err; } bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_slave_dev_check); static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!dsa_slave_dev_check(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_slave_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be * meaningfully enslaved to a bridge yet */ return NOTIFY_DONE; } static int dsa_slave_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_slave_dev_check(lower)) continue; dp = dsa_slave_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_slave_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_slave_lag_changeupper() except that it calls * dsa_slave_prechangeupper() */ static int dsa_slave_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_slave_dev_check(lower)) continue; dp = dsa_slave_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_slave_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *slave, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; slave = vlan_dev_real_dev(dev); if (!dsa_slave_dev_check(slave)) return NOTIFY_DONE; dp = dsa_slave_to_port(slave); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot enslave VLAN device into VLAN aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_slave_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_slave_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_slave_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_slave_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_slave_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA master, a LAG must have all lower interfaces be * eligible DSA masters. Additionally, all LAG slaves must be DSA masters of * switches in the same switch tree. */ static int dsa_lag_master_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA masters"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA masters of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_master_prechangeupper_sanity_check(struct net_device *master, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(master)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_slave_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA masters, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_master_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_master_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA master cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_master_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA masters can join a LAG DSA master"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA master for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA masters, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA master is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA master"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_master(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_master = dsa_tree_find_first_master(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_master(dp) != lag_dev) continue; err = dsa_slave_change_master(dp->slave, new_master, NULL); if (err) { netdev_err(dp->slave, "failed to restore master to %s: %pe\n", new_master->name, ERR_PTR(err)); } } } static int dsa_master_lag_join(struct net_device *master, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = master->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_master_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_master(dp) != master) continue; err = dsa_slave_change_master(dp->slave, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_master(dp) != lag_dev) continue; err = dsa_slave_change_master(dp->slave, master, NULL); if (err) { netdev_err(dp->slave, "failed to restore master to %s: %pe\n", master->name, ERR_PTR(err)); } } dsa_master_lag_teardown(lag_dev, master->dsa_ptr); return err; } static void dsa_master_lag_leave(struct net_device *master, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_master() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_master(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA master has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_master(dst, lag_dev); } /* This DSA master has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_master_lag_teardown(lag_dev, master->dsa_ptr); } static int dsa_master_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_master_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_master_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_slave_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_master_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_master_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_slave_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_slave_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_slave_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_slave_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_master_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_slave_dev_check(dev)) { dp = dsa_slave_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA masters that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of master port. * DSA driver may require the master port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the master port is UP */ dsa_tree_master_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the master port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a master port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_master_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_master_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->slave->close_list, &close_list); } dev_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_slave_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_slave_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_slave_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_slave_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_slave_dev_check, dsa_slave_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_slave_dev_check, dsa_foreign_dev_check, dsa_slave_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_slave_dev_check, dsa_foreign_dev_check, dsa_slave_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_slave_dev_check, dsa_foreign_dev_check, dsa_slave_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_slave_dev_check, dsa_slave_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_slave_nb __read_mostly = { .notifier_call = dsa_slave_netdevice_event, }; struct notifier_block dsa_slave_switchdev_notifier = { .notifier_call = dsa_slave_switchdev_event, }; struct notifier_block dsa_slave_switchdev_blocking_notifier = { .notifier_call = dsa_slave_switchdev_blocking_event, }; int dsa_slave_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_slave_nb); if (err) return err; err = register_switchdev_notifier(&dsa_slave_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_slave_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_slave_nb); return err; } void dsa_slave_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_slave_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_slave_nb); if (err) pr_err("DSA: failed to unregister slave notifier (%d)\n", err); }
1536 1536 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; while (--i >= 0) kfree(siw_cpu_info.tx_valid_cpus[i]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; return -ENOMEM; } static void siw_destroy_cpulist(void) { int i = 0; while (i < siw_cpu_info.num_nodes) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } /* * Network link becomes unavailable. Mark all * affected QP's accordingly. */ static void siw_netdev_down(struct work_struct *work) { struct siw_device *sdev = container_of(work, struct siw_device, netdev_down); struct siw_qp_attrs qp_attrs; struct list_head *pos, *tmp; memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.state = SIW_QP_STATE_ERROR; list_for_each_safe(pos, tmp, &sdev->qp_list) { struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); down_write(&qp->state_lock); WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); up_write(&qp->state_lock); } ib_device_put(&sdev->base_dev); } static void siw_device_goes_down(struct siw_device *sdev) { if (ib_device_try_get(&sdev->base_dev)) { INIT_WORK(&sdev->netdev_down, siw_netdev_down); schedule_work(&sdev->netdev_down); } } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_GOING_DOWN: siw_device_goes_down(sdev); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
6 6 6 6 5 6 1 1 1 1 2 2 1 1 1 1 1 1 1 5 5 5 1 1 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com> * - reset skb->pkt_type on incoming packets when MAC was changed * - see that changed MAC is saddr for outgoing packets * Oct 20, 2001: Ard van Breeman: * - Fix MC-list, finally. * - Flush MC-list on VLAN destroy. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/phy.h> #include <net/arp.h> #include <net/macsec.h> #include "vlan.h" #include "vlanproc.h" #include <linux/if_vlan.h> #include <linux/netpoll.h> /* * Create the VLAN header for an arbitrary protocol layer * * saddr=NULL means use device source address * daddr=NULL means leave destination address (eg unresolved arp) * * This is called when the SKB is moving down the stack towards the * physical devices. */ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_hdr *vhdr; unsigned int vhdrlen = 0; u16 vlan_tci = 0; int rc; if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) { vhdr = skb_push(skb, VLAN_HLEN); vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); vhdr->h_vlan_TCI = htons(vlan_tci); /* * Set the protocol type. For a packet of type ETH_P_802_3/2 we * put the length in here instead. */ if (type != ETH_P_802_3 && type != ETH_P_802_2) vhdr->h_vlan_encapsulated_proto = htons(type); else vhdr->h_vlan_encapsulated_proto = htons(len); skb->protocol = vlan->vlan_proto; type = ntohs(vlan->vlan_proto); vhdrlen = VLAN_HLEN; } /* Before delegating work to the lower layer, enter our MAC-address */ if (saddr == NULL) saddr = dev->dev_addr; /* Now make the underlying real hard header */ dev = vlan->real_dev; rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen); if (rc > 0) rc += vhdrlen; return rc; } static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; /* Handle non-VLAN frames if they are sent to us, for example by DHCP. * * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (vlan->flags & VLAN_FLAG_REORDER_HDR || veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci); } skb->dev = vlan->real_dev; len = skb->len; if (unlikely(netpoll_tx_running(dev))) return vlan_netpoll_send_skb(vlan, skb); ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->tx_packets); u64_stats_add(&stats->tx_bytes, len); u64_stats_update_end(&stats->syncp); } else { this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; } static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; unsigned int max_mtu = real_dev->mtu; if (netif_reduces_vlan_mtu(real_dev)) max_mtu -= VLAN_HLEN; if (max_mtu < new_mtu) return -ERANGE; dev->mtu = new_mtu; return 0; } void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) vlan->nr_ingress_mappings--; else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio) vlan->nr_ingress_mappings++; vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio; } int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *mp = NULL; struct vlan_priority_tci_mapping *np; u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; /* See if a priority mapping exists.. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; while (mp) { if (mp->priority == skb_prio) { if (mp->vlan_qos && !vlan_qos) vlan->nr_egress_mappings--; else if (!mp->vlan_qos && vlan_qos) vlan->nr_egress_mappings++; mp->vlan_qos = vlan_qos; return 0; } mp = mp->next; } /* Create a new mapping then. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); if (!np) return -ENOBUFS; np->next = mp; np->priority = skb_prio; np->vlan_qos = vlan_qos; /* Before inserting this element in hash table, make sure all its fields * are committed to memory. * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() */ smp_wmb(); vlan->egress_priority_map[skb_prio & 0xF] = np; if (vlan_qos) vlan->nr_egress_mappings++; return 0; } /* Flags are defined in the vlan_flags enum in * include/uapi/linux/if_vlan.h file. */ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) { if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); else vlan_gvrp_request_leave(dev); } if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) { if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); else vlan_mvrp_request_leave(dev); } return 0; } void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev) { if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; int err; if (!(real_dev->flags & IFF_UP) && !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(real_dev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(real_dev, 1); if (err < 0) goto clear_allmulti; } ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); if (netif_carrier_ok(real_dev) && !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_on(dev); return 0; clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); del_unicast: if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: netif_carrier_off(dev); return err; } static int vlan_dev_stop(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(real_dev, -1); if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_off(dev); return 0; } static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct sockaddr *addr = p; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (!(dev->flags & IFF_UP)) goto out; if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: eth_hw_addr_set(dev, addr->sa_data); return 0; } static int vlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return generic_hwtstamp_get_lower(real_dev, cfg); } static int vlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (!net_eq(dev_net(dev), dev_net(real_dev))) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; struct ifreq ifrr; int err = -EOPNOTSUPP; strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } if (!err) ifr->ifr_ifru = ifrr.ifr_ifru; return err; } static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) err = ops->ndo_neigh_setup(real_dev, pa); return err; } #if IS_ENABLED(CONFIG_FCOE) static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_setup) rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc); return rc; } static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int len = 0; if (ops->ndo_fcoe_ddp_done) len = ops->ndo_fcoe_ddp_done(real_dev, xid); return len; } static int vlan_dev_fcoe_enable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_enable) rc = ops->ndo_fcoe_enable(real_dev); return rc; } static int vlan_dev_fcoe_disable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_disable) rc = ops->ndo_fcoe_disable(real_dev); return rc; } static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_target) rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc); return rc; } #endif #ifdef NETDEV_FCOE_WWNN static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_get_wwn) rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type); return rc; } #endif static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } /* * vlan network devices have devices nesting below it, and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key vlan_netdev_xmit_lock_key; static struct lock_class_key vlan_netdev_addr_lock_key; static void vlan_dev_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *unused) { lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } static void vlan_dev_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &vlan_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL); } static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; if (saddr == NULL) saddr = dev->dev_addr; return dev_hard_header(skb, real_dev, type, daddr, saddr, len); } static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static struct device_type vlan_type = { .name = "vlan", }; static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; netif_carrier_off(dev); /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_MASTER | IFF_SLAVE); dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING) dev->state |= (1 << __LINK_STATE_NOCARRIER); dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; dev->features |= dev->hw_features | NETIF_F_LLTX; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; dev->hw_enc_features = vlan_tnl_features(real_dev); dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); #if IS_ENABLED(CONFIG_FCOE) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif dev->needed_headroom = real_dev->needed_headroom; if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) { dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; } dev->netdev_ops = &vlan_netdev_ops; SET_NETDEV_DEVTYPE(dev, &vlan_type); vlan_dev_set_lockdep_class(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; /* Get vlan's reference to real_dev */ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL); return 0; } /* Note: this function might be called multiple times for the same device. */ void vlan_dev_free_egress_priority(const struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; kfree(pm); } } } static void vlan_dev_uninit(struct net_device *dev) { vlan_dev_free_egress_priority(dev); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; netdev_features_t lower_features; lower_features = netdev_intersect_features((real_dev->vlan_features | NETIF_F_RXCSUM), real_dev->features); /* Add HW_CSUM setting to preserve user ability to control * checksum offload on the vlan device. */ if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); features |= NETIF_F_LLTX; return features; } static int vlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, vlan_fullname, sizeof(info->driver)); strscpy(info->version, vlan_version, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); } static int vlan_ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; struct phy_device *phydev = vlan->real_dev->phydev; if (phy_has_tsinfo(phydev)) { return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; } return 0; } static void vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; int i; for_each_possible_cpu(i) { u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; unsigned int start; p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; stats->multicast += rxmulticast; stats->tx_packets += txpackets; stats->tx_bytes += txbytes; /* rx_errors & tx_dropped are u32 */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; } #ifdef CONFIG_NET_POLL_CONTROLLER static void vlan_dev_poll_controller(struct net_device *dev) { return; } static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void vlan_dev_netpoll_cleanup(struct net_device *dev) { struct vlan_dev_priv *vlan= vlan_dev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int vlan_dev_get_iflink(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return real_dev->ifindex; } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); path->type = DEV_PATH_VLAN; path->encap.id = vlan->vlan_id; path->encap.proto = vlan->vlan_proto; path->dev = ctx->dev; ctx->dev = vlan->real_dev; if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) return -ENOSPC; ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto; ctx->num_vlans++; return 0; } #if IS_ENABLED(CONFIG_MACSEC) static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx) { return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops; } static int vlan_macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { if (unlikely(!func)) return 0; return (*func)(ctx); } static int vlan_macsec_dev_open(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_open, ctx); } static int vlan_macsec_dev_stop(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_stop, ctx); } static int vlan_macsec_add_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_secy, ctx); } static int vlan_macsec_upd_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_secy, ctx); } static int vlan_macsec_del_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_secy, ctx); } static int vlan_macsec_add_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsc, ctx); } static int vlan_macsec_upd_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx); } static int vlan_macsec_del_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsc, ctx); } static int vlan_macsec_add_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsa, ctx); } static int vlan_macsec_upd_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx); } static int vlan_macsec_del_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsa, ctx); } static int vlan_macsec_add_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_txsa, ctx); } static int vlan_macsec_upd_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_txsa, ctx); } static int vlan_macsec_del_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_txsa, ctx); } static int vlan_macsec_get_dev_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx); } static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx); } static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx); } static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx); } static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx); } static const struct macsec_ops macsec_offload_ops = { /* Device wide */ .mdo_dev_open = vlan_macsec_dev_open, .mdo_dev_stop = vlan_macsec_dev_stop, /* SecY */ .mdo_add_secy = vlan_macsec_add_secy, .mdo_upd_secy = vlan_macsec_upd_secy, .mdo_del_secy = vlan_macsec_del_secy, /* Security channels */ .mdo_add_rxsc = vlan_macsec_add_rxsc, .mdo_upd_rxsc = vlan_macsec_upd_rxsc, .mdo_del_rxsc = vlan_macsec_del_rxsc, /* Security associations */ .mdo_add_rxsa = vlan_macsec_add_rxsa, .mdo_upd_rxsa = vlan_macsec_upd_rxsa, .mdo_del_rxsa = vlan_macsec_del_rxsa, .mdo_add_txsa = vlan_macsec_add_txsa, .mdo_upd_txsa = vlan_macsec_upd_txsa, .mdo_del_txsa = vlan_macsec_del_txsa, /* Statistics */ .mdo_get_dev_stats = vlan_macsec_get_dev_stats, .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats, .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats, .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats, .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats, }; #endif static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, }; static const struct net_device_ops vlan_netdev_ops = { .ndo_change_mtu = vlan_dev_change_mtu, .ndo_init = vlan_dev_init, .ndo_uninit = vlan_dev_uninit, .ndo_open = vlan_dev_open, .ndo_stop = vlan_dev_stop, .ndo_start_xmit = vlan_dev_hard_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup, .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done, .ndo_fcoe_enable = vlan_dev_fcoe_enable, .ndo_fcoe_disable = vlan_dev_fcoe_disable, .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, #endif #ifdef NETDEV_FCOE_WWNN .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, #endif #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = vlan_dev_poll_controller, .ndo_netpoll_setup = vlan_dev_netpoll_setup, .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, .ndo_hwtstamp_get = vlan_hwtstamp_get, .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; /* Get rid of the vlan's reference to real_dev */ netdev_put(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->netdev_ops = &vlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; #if IS_ENABLED(CONFIG_MACSEC) dev->macsec_ops = &macsec_offload_ops; #endif dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; eth_zero_addr(dev->broadcast); }
248 1335 520 12869 4 1860 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: common low-level thread information accessors * * Copyright (C) 2002 David Howells (dhowells@redhat.com) * - Incorporating suggestions made by Linus Torvalds */ #ifndef _LINUX_THREAD_INFO_H #define _LINUX_THREAD_INFO_H #include <linux/types.h> #include <linux/limits.h> #include <linux/bug.h> #include <linux/restart_block.h> #include <linux/errno.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, * including <asm/current.h> can cause a circular dependency on some platforms. */ #include <asm/current.h> #define current_thread_info() ((struct thread_info *)current) #endif #include <linux/bitops.h> /* * For per-arch arch_within_stack_frames() implementations, defined in * asm/thread_info.h. */ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) #endif #include <asm/thread_info.h> #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } /* * This may be used in noinstr code, and needs to be __always_inline to prevent * inadvertent instrumentation. */ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti) { return READ_ONCE(ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define read_thread_flags() \ read_ti_thread_flags(current_thread_info()) #define read_task_thread_flags(t) \ read_ti_thread_flags(task_thread_info(t)) #ifdef CONFIG_GENERIC_ENTRY #define set_syscall_work(fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define test_syscall_work(fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define clear_syscall_work(fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define set_task_syscall_work(t, fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define test_task_syscall_work(t, fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define clear_task_syscall_work(t, fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) #define test_task_syscall_work(t, fl) \ test_ti_thread_flag(task_thread_info(t), TIF_##fl) #define clear_task_syscall_work(t, fl) \ clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H static __always_inline bool tif_need_resched(void) { return arch_test_bit(TIF_NEED_RESCHED, (unsigned long *)(&current_thread_info()->flags)); } #else static __always_inline bool tif_need_resched(void) { return test_bit(TIF_NEED_RESCHED, (unsigned long *)(&current_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifdef CONFIG_HARDENED_USERCOPY extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif void arch_task_cache_init(void); /* for CONFIG_SH */ void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
80 1 26 36 36 26 37 37 36 36 36 36 36 25 25 25 25 25 10 10 10 10 9 9 9 13 14 13 25 14 25 14 13 13 13 14 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 16 16 15 5 15 12 12 12 12 12 11 12 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 // SPDX-License-Identifier: GPL-2.0-or-later /* * Forwarding database * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/atomic.h> #include <asm/unaligned.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <trace/events/bridge.h> #include "br_private.h" static const struct rhashtable_params br_fdb_rht_params = { .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode), .key_offset = offsetof(struct net_bridge_fdb_entry, key), .key_len = sizeof(struct net_bridge_fdb_key), .automatic_shrinking = true, }; static struct kmem_cache *br_fdb_cache __read_mostly; int __init br_fdb_init(void) { br_fdb_cache = kmem_cache_create("bridge_fdb_cache", sizeof(struct net_bridge_fdb_entry), 0, SLAB_HWCACHE_ALIGN, NULL); if (!br_fdb_cache) return -ENOMEM; return 0; } void br_fdb_fini(void) { kmem_cache_destroy(br_fdb_cache); } int br_fdb_hash_init(struct net_bridge *br) { return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params); } void br_fdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->fdb_hash_tbl); } /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) */ static inline unsigned long hold_time(const struct net_bridge *br) { return br->topology_change ? br->forward_delay : br->ageing_time; } static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { return !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && time_before_eq(fdb->updated + hold_time(br), jiffies); } static void fdb_rcu_free(struct rcu_head *head) { struct net_bridge_fdb_entry *ent = container_of(head, struct net_bridge_fdb_entry, rcu); kmem_cache_free(br_fdb_cache, ent); } static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; } static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, u32 portid, u32 seq, int type, unsigned int flags) { const struct net_bridge_port *dst = READ_ONCE(fdb->dst); unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; u32 ext_flags = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = 0; ndm->ndm_type = 0; ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (test_bit(BR_FDB_LOCKED, &fdb->flags)) ext_flags |= NTF_EXT_LOCKED; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->key.vlan_id)) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); u8 notify_bits = FDB_NOTIFY_BIT; if (!nest) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) notify_bits |= FDB_NOTIFY_INACTIVE_BIT; if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { nla_nest_cancel(skb, nest); goto nla_put_failure; } nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t fdb_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, bool swdev_notify) { struct net *net = dev_net(br->dev); struct sk_buff *skb; int err = -ENOBUFS; if (swdev_notify) br_switchdev_fdb_notify(br, fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_key key; WARN_ON_ONCE(!rcu_read_lock_held()); key.vlan_id = vid; memcpy(key.addr.addr, addr, sizeof(key.addr.addr)); return rhashtable_lookup(tbl, &key, br_fdb_rht_params); } /* requires bridge hash_lock */ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *fdb; lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); rcu_read_unlock(); return fdb; } struct net_device *br_fdb_find_port(const struct net_device *br_dev, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *f; struct net_device *dev = NULL; struct net_bridge *br; ASSERT_RTNL(); if (!netif_is_bridge_master(br_dev)) return NULL; br = netdev_priv(br_dev); rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (f && f->dst) dev = f->dst->dev; rcu_read_unlock(); return dev; } EXPORT_SYMBOL_GPL(br_fdb_find_port); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) { return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); } /* When a static FDB entry is added, the mac address from the entry is * added to the bridge private HW address list and all required ports * are then updated with the new information. * Called under RTNL. */ static void fdb_add_hw_addr(struct net_bridge *br, const unsigned char *addr) { int err; struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) { err = dev_uc_add(p->dev, addr); if (err) goto undo; } } return; undo: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } /* When a static FDB entry is deleted, the HW address from that entry is * also removed from the bridge private HW address list and updates all * the ports with needed information. * Called under RTNL. */ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) { struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, bool swdev_notify) { trace_fdb_delete(br, f); if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, br_fdb_rht_params); fdb_notify(br, f, RTM_DELNEIGH, swdev_notify); call_rcu(&f->rcu, fdb_rcu_free); } /* Delete a local entry if no other port had the same address. */ static void fdb_delete_local(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_fdb_entry *f) { const unsigned char *addr = f->key.addr.addr; struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; struct net_bridge_port *op; u16 vid = f->key.vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } vg = br_vlan_group(br); v = br_vlan_find(vg, vid); /* Maybe bridge device has same hw addr? */ if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } fdb_delete(br, f, true); } void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *f; spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; int err; fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (!fdb) return NULL; memcpy(fdb->key.addr.addr, addr, ETH_ALEN); WRITE_ONCE(fdb->dst, source); fdb->key.vlan_id = vid; fdb->flags = flags; fdb->updated = fdb->used = jiffies; err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, br_fdb_rht_params); if (err) { kmem_cache_free(br_fdb_cache, fdb); return NULL; } hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); return fdb; } static int fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) return -EINVAL; fdb = br_fdb_find(br, addr, vid); if (fdb) { /* it is okay to have multiple ports with same * address, just use the first one. */ if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } fdb = fdb_create(br, source, addr, vid, BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; fdb_add_hw_addr(br, addr); fdb_notify(br, fdb, RTM_NEWNEIGH, true); return 0; } void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); /* if this port has no vlan information * configured, we can safely be done at * this point. */ if (!vg || !vg->num_vlans) goto insert; } } insert: /* insert new address, may fail if invalid address or dup. */ fdb_add_local(br, p, newaddr, 0); if (!vg || !vg->num_vlans) goto done; /* Now add entries for every VLAN configured on the port. * This function runs under RTNL so the bitmap will not change * from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) fdb_add_local(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); } void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); if (!vg || !vg->num_vlans) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_add_local(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); } void br_fdb_cleanup(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, gc_work.work); struct net_bridge_fdb_entry *f = NULL; unsigned long delay = hold_time(br); unsigned long work_delay = delay; unsigned long now = jiffies; /* this part is tricky, in order to avoid blocking learning and * consequently forwarding, we rely on rcu to delete objects with * delayed freeing allowing us to continue traversing */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { if (test_bit(BR_FDB_NOTIFY, &f->flags)) { if (time_after(this_timer, now)) work_delay = min(work_delay, this_timer - now); else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &f->flags)) fdb_notify(br, f, RTM_NEWNEIGH, false); } continue; } if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } } rcu_read_unlock(); /* Cleanup minimum 10 milliseconds apart */ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) { const struct net_bridge_port *dst = READ_ONCE(f->dst); int port_ifidx = dst ? dst->dev->ifindex : br->dev->ifindex; if (desc->vlan_id && desc->vlan_id != f->key.vlan_id) return false; if (desc->port_ifindex && desc->port_ifindex != port_ifidx) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; return true; } /* Flush forwarding database entries matching the description */ void br_fdb_flush(struct net_bridge *br, const struct net_bridge_fdb_flush_desc *desc) { struct net_bridge_fdb_entry *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (!__fdb_flush_matches(br, f, desc)) continue; spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } rcu_read_unlock(); } static unsigned long __ndm_state_to_fdb_flags(u16 ndm_state) { unsigned long flags = 0; if (ndm_state & NUD_PERMANENT) __set_bit(BR_FDB_LOCAL, &flags); if (ndm_state & NUD_NOARP) __set_bit(BR_FDB_STATIC, &flags); return flags; } static unsigned long __ndm_flags_to_fdb_flags(u8 ndm_flags) { unsigned long flags = 0; if (ndm_flags & NTF_USE) __set_bit(BR_FDB_ADDED_BY_USER, &flags); if (ndm_flags & NTF_EXT_LEARNED) __set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &flags); if (ndm_flags & NTF_OFFLOADED) __set_bit(BR_FDB_OFFLOADED, &flags); if (ndm_flags & NTF_STICKY) __set_bit(BR_FDB_STICKY, &flags); return flags; } static int __fdb_flush_validate_ifindex(const struct net_bridge *br, int ifindex, struct netlink_ext_ack *extack) { const struct net_device *dev; dev = __dev_get_by_index(dev_net(br->dev), ifindex); if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unknown flush device ifindex"); return -ENODEV; } if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { NL_SET_ERR_MSG_MOD(extack, "Flush device is not a bridge or bridge port"); return -EINVAL; } if (netif_is_bridge_master(dev) && dev != br->dev) { NL_SET_ERR_MSG_MOD(extack, "Flush bridge device does not match target bridge device"); return -EINVAL; } if (netif_is_bridge_port(dev)) { struct net_bridge_port *p = br_port_get_rtnl(dev); if (p->br != br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } return 0; } int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, u16 vid, struct netlink_ext_ack *extack) { u8 ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS; struct net_bridge_fdb_flush_desc desc = { .vlan_id = vid }; struct net_bridge_port *p = NULL; struct net_bridge *br; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); } else { p = br_port_get_rtnl(dev); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge port"); return -EINVAL; } br = p->br; } if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.flags |= __ndm_state_to_fdb_flags(ndm->ndm_state); desc.flags |= __ndm_flags_to_fdb_flags(ndm_flags); if (tb[NDA_NDM_STATE_MASK]) { u16 ndm_state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); desc.flags_mask |= __ndm_state_to_fdb_flags(ndm_state_mask); } if (tb[NDA_NDM_FLAGS_MASK]) { u8 ndm_flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask); } if (tb[NDA_IFINDEX]) { int err, ifidx = nla_get_s32(tb[NDA_IFINDEX]); err = __fdb_flush_validate_ifindex(br, ifidx, extack); if (err) return err; desc.port_ifindex = ifidx; } else if (p) { /* flush was invoked with port device and NTF_MASTER */ desc.port_ifindex = p->dev->ifindex; } br_debug(br, "flushing port ifindex: %d vlan id: %u flags: 0x%lx flags mask: 0x%lx\n", desc.port_ifindex, desc.vlan_id, desc.flags, desc.flags_mask); br_fdb_flush(br, &desc); return 0; } /* Flush all entries referring to a specific port. * if do_all is set also flush static entries * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all) { struct net_bridge_fdb_entry *f; struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { if (f->dst != p) continue; if (!do_all) if (test_bit(BR_FDB_STATIC, &f->flags) || (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags)) || (vid && f->key.vlan_id != vid)) continue; if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } #if IS_ENABLED(CONFIG_ATM_LANE) /* Interface used by ATM LANE hook to test * if an addr is on some other bridge port */ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) { struct net_bridge_fdb_entry *fdb; struct net_bridge_port *port; int ret; rcu_read_lock(); port = br_port_get_rcu(dev); if (!port) ret = 0; else { const struct net_bridge_port *dst = NULL; fdb = br_fdb_find_rcu(port->br, addr, 0); if (fdb) dst = READ_ONCE(fdb->dst); ret = dst && dst->dev != dev && dst->state == BR_STATE_FORWARDING; } rcu_read_unlock(); return ret; } #endif /* CONFIG_ATM_LANE */ /* * Fill buffer with forwarding table records in * the API format. */ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { struct net_bridge_fdb_entry *f; struct __fdb_entry *fe = buf; int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (num >= maxnum) break; if (has_expired(br, f)) continue; /* ignore pseudo entry for local MAC address */ if (!f->dst) continue; if (skip) { --skip; continue; } /* convert from internal format to API */ memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); /* due to ABI compat need to split into hi/lo */ fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; } rcu_read_unlock(); return num; } /* Add entry for local address of interface */ int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { int ret; spin_lock_bh(&br->hash_lock); ret = fdb_add_local(br, source, addr, vid); spin_unlock_bh(&br->hash_lock); return ret; } /* returns true if the fdb was modified */ static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) { return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; /* some users want to always flood. */ if (hold_time(br) == 0) return; fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); } else { unsigned long now = jiffies; bool fdb_modified = false; if (now != fdb->updated) { fdb->updated = now; fdb_modified = __fdb_mark_active(fdb); } /* fastpath: update of existing entry */ if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH); WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); /* Clear locked flag when roaming to an * unlocked port. */ if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags))) clear_bit(BR_FDB_LOCKED, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts * it first, don't bother updating */ spin_unlock(&br->hash_lock); } } /* Dump information about entries, in response to GETNEIGH */ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; if (!netif_is_bridge_master(dev)) return err; if (!filter_dev) { err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (err < 0) return err; } rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (*idx < cb->args[2]) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) goto skip; /* !f->dst is a special case for bridge * It means the MAC belongs to the bridge * Therefore need a little more filtering * we only want to dump the !f->dst case */ if (f->dst) goto skip; } if (!filter_dev && f->dst) goto skip; err = fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI); if (err < 0) break; skip: *idx += 1; } rcu_read_unlock(); return err; } int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = fdb_fill_info(skb, br, f, portid, seq, RTM_NEWNEIGH, 0); errout: rcu_read_unlock(); return err; } /* returns true if the fdb is modified */ static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) { bool modified = false; /* allow to mark an entry as inactive, usually done on creation */ if ((notify & FDB_NOTIFY_INACTIVE_BIT) && !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) modified = true; if ((notify & FDB_NOTIFY_BIT) && !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* enabled activity tracking */ modified = true; } else if (!(notify & FDB_NOTIFY_BIT) && test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* disabled activity tracking, clear notify state */ clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); modified = true; } return modified; } /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && !(source->state == BR_STATE_LEARNING || source->state == BR_STATE_FORWARDING)) return -EPERM; if (!source && !(state & NUD_PERMANENT)) { pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n", br->dev->name); return -EINVAL; } if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) return -EINVAL; } fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; fdb = fdb_create(br, source, addr, vid, 0); if (!fdb) return -ENOMEM; modified = true; } else { if (flags & NLM_F_EXCL) return -EEXIST; if (READ_ONCE(fdb->dst) != source) { WRITE_ONCE(fdb->dst, source); modified = true; } } if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { set_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else if (state & NUD_NOARP) { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); } modified = true; } if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags)) modified = true; if (fdb_handle_notify(fdb, notify)) modified = true; set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { if (refresh) fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } return 0; } static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], struct netlink_ext_ack *extack) { int err = 0; if (ndm->ndm_flags & NTF_USE) { if (!p) { pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n", br->dev->name); return -EINVAL; } if (!nbp_state_should_learn(p)) return 0; local_bh_disable(); rcu_read_lock(); br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { if (!p && !(ndm->ndm_state & NUD_PERMANENT)) { NL_SET_ERR_MSG_MOD(extack, "FDB entry towards bridge must be permanent"); return -EINVAL; } err = br_fdb_external_learn_add(br, p, addr, vid, false, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; u32 ext_flags = 0; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (is_zero_ether_addr(addr)) { pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); return -EINVAL; } if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } br = p->br; vg = nbp_vlan_group(p); } if (tb[NDA_FLAGS_EXT]) ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]); if (ext_flags & NTF_EXT_LOCKED) { NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set"); return -EINVAL; } if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, br_nda_fdb_pol, extack); if (err) return err; } else { memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); } if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, extack); } else { err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, extack); if (err || !vg || !vg->num_vlans) goto out; /* We have vlans configured on this port and user didn't * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, nfea_tb, extack); if (err) goto out; } } out: return err; } static int fdb_delete_by_addr_and_port(struct net_bridge *br, const struct net_bridge_port *p, const u8 *addr, u16 vlan) { struct net_bridge_fdb_entry *fdb; fdb = br_fdb_find(br, addr, vlan); if (!fdb || READ_ONCE(fdb->dst) != p) return -ENOENT; fdb_delete(br, fdb, true); return 0; } static int __br_fdb_delete(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { int err; spin_lock_bh(&br->hash_lock); err = fdb_delete_by_addr_and_port(br, p, addr, vid); spin_unlock_bh(&br->hash_lock); return err; } /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br; int err; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } vg = nbp_vlan_group(p); br = p->br; } if (vid) { v = br_vlan_find(vg, vid); if (!v) { pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } err = __br_fdb_delete(br, p, addr, vid); } else { err = -ENOENT; err &= __br_fdb_delete(br, p, addr, 0); if (!vg || !vg->num_vlans) return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err &= __br_fdb_delete(br, p, addr, v->vid); } } return err; } int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f, *tmp; int err = 0; ASSERT_RTNL(); /* the key here is that static entries change only under rtnl */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) goto rollback; } done: rcu_read_unlock(); return err; rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; dev_uc_del(p->dev, tmp->key.addr.addr); } goto done; } void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f; ASSERT_RTNL(); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); } rcu_read_unlock(); } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool locked, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; bool modified = false; int err = 0; trace_br_fdb_external_learn_add(br, p, addr, vid); if (locked && (!p || !(p->flags & BR_PORT_MAB))) return -EINVAL; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (!fdb) { unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); if (!p) flags |= BIT(BR_FDB_LOCAL); if (locked) flags |= BIT(BR_FDB_LOCKED); fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { if (locked && (!test_bit(BR_FDB_LOCKED, &fdb->flags) || READ_ONCE(fdb->dst) != p)) { err = -EINVAL; goto err_unlock; } fdb->updated = jiffies; if (READ_ONCE(fdb->dst) != p) { WRITE_ONCE(fdb->dst, p); modified = true; } if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { /* Take over SW learned entry */ set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); modified = true; } if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) { change_bit(BR_FDB_LOCKED, &fdb->flags); modified = true; } if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (!p) set_bit(BR_FDB_LOCAL, &fdb->flags); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } err_unlock: spin_unlock_bh(&br->hash_lock); return err; } int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; int err = 0; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; spin_unlock_bh(&br->hash_lock); return err; } void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded) { struct net_bridge_fdb_entry *fdb; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } void br_fdb_clear_offload(const struct net_device *dev, u16 vid) { struct net_bridge_fdb_entry *f; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return; spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
1529 4 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 // SPDX-License-Identifier: GPL-2.0-only /* net/atm/clip.c - RFC1577 Classical IP over ATM */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> /* for UINT_MAX */ #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/timer.h> #include <linux/if_arp.h> /* for some manifest constants */ #include <linux/notifier.h> #include <linux/atm.h> #include <linux/atmdev.h> #include <linux/atmclip.h> #include <linux/atmarp.h> #include <linux/capability.h> #include <linux/ip.h> /* for net/route.h */ #include <linux/in.h> /* for struct sockaddr_in */ #include <linux/if.h> /* for IFF_UP */ #include <linux/inetdevice.h> #include <linux/bitops.h> #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/route.h> /* for struct rtable and routing */ #include <net/icmp.h> /* icmp_send */ #include <net/arp.h> #include <linux/param.h> /* for HZ */ #include <linux/uaccess.h> #include <asm/byteorder.h> /* for htons etc. */ #include <linux/atomic.h> #include "common.h" #include "resources.h" #include <net/atmclip.h> static struct net_device *clip_devs; static struct atm_vcc *atmarpd; static struct timer_list idle_timer; static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { struct sock *sk; struct atmarp_ctrl *ctrl; struct sk_buff *skb; pr_debug("(%d)\n", type); if (!atmarpd) return -EUNATCH; skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); if (!skb) return -ENOMEM; ctrl = skb_put(skb, sizeof(struct atmarp_ctrl)); ctrl->type = type; ctrl->itf_num = itf; ctrl->ip = ip; atm_force_charge(atmarpd, skb->truesize); sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return 0; } static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) { pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh); clip_vcc->entry = entry; clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ clip_vcc->next = entry->vccs; entry->vccs = clip_vcc; entry->neigh->used = jiffies; } static void unlink_clip_vcc(struct clip_vcc *clip_vcc) { struct atmarp_entry *entry = clip_vcc->entry; struct clip_vcc **walk; if (!entry) { pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { int error; *walk = clip_vcc->next; /* atomic */ clip_vcc->entry = NULL; if (clip_vcc->xoff) netif_wake_queue(entry->neigh->dev); if (entry->vccs) goto out; entry->expires = jiffies - 1; /* force resolution or expiration */ error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) pr_err("neigh_update failed with %d\n", error); goto out; } pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ static int neigh_check_cb(struct neighbour *n) { struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; if (n->ops != &clip_neigh_ops) return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; if (cv->idle_timeout && time_after(jiffies, exp)) { pr_debug("releasing vcc %p->%p of entry %p\n", cv, cv->vcc, entry); vcc_release_async(cv->vcc, -ETIMEDOUT); } } if (entry->vccs || time_before(jiffies, entry->expires)) return 0; if (refcount_read(&n->refcnt) > 1) { struct sk_buff *skb; pr_debug("destruction postponed with ref %d\n", refcount_read(&n->refcnt)); while ((skb = skb_dequeue(&n->arp_queue)) != NULL) dev_kfree_skb(skb); return 0; } pr_debug("expired neigh %p\n", n); return 1; } static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) { struct atm_vcc *vcc; pr_debug("\n"); vcc = ATM_SKB(skb)->vcc; if (!vcc || !atm_charge(vcc, skb->truesize)) { dev_kfree_skb_any(skb); return 0; } pr_debug("pushing to %p\n", vcc); pr_debug("using %p\n", CLIP_VCC(vcc)->old_push); CLIP_VCC(vcc)->old_push(vcc, skb); return 0; } static const unsigned char llc_oui[] = { 0xaa, /* DSAP: non-ISO */ 0xaa, /* SSAP: non-ISO */ 0x03, /* Ctrl: Unnumbered Information Command PDU */ 0x00, /* OUI: EtherType */ 0x00, 0x00 }; static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); pr_debug("\n"); if (!clip_devs) { atm_return(vcc, skb->truesize); kfree_skb(skb); return; } if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) unlink_clip_vcc(clip_vcc); clip_vcc->old_push(vcc, NULL); /* pass on the bad news */ kfree(clip_vcc); return; } atm_return(vcc, skb->truesize); skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { dev_kfree_skb_any(skb); return; } ATM_SKB(skb)->vcc = vcc; skb_reset_mac_header(skb); if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, llc_oui, sizeof(llc_oui))) skb->protocol = htons(ETH_P_IP); else { skb->protocol = ((__be16 *)skb->data)[3]; skb_pull(skb, RFC1483LLC_LEN); if (skb->protocol == htons(ETH_P_ARP)) { skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; clip_arp_rcv(skb); return; } } clip_vcc->last_use = jiffies; skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } /* * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that * clip_pop is atomic with respect to the critical section in clip_start_xmit. */ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); struct net_device *dev = skb->dev; int old; unsigned long flags; pr_debug("(vcc %p)\n", vcc); clip_vcc->old_pop(vcc, skb); /* skb->dev == NULL in outbound ARP packets */ if (!dev) return; spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags); if (atm_may_send(vcc, 0)) { old = xchg(&clip_vcc->xoff, 0); if (old) netif_wake_queue(dev); } spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); } static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 *ip = (__be32 *) neigh->primary_key; pr_debug("(neigh %p, skb %p)\n", neigh, skb); to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip); } static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) { #ifndef CONFIG_ATM_CLIP_NO_ICMP icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); #endif kfree_skb(skb); } static const struct neigh_ops clip_neigh_ops = { .family = AF_INET, .solicit = clip_neigh_solicit, .error_report = clip_neigh_error, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); if (neigh->tbl->family != AF_INET) return -EINVAL; if (neigh->type != RTN_UNICAST) return -EINVAL; neigh->nud_state = NUD_NONE; neigh->ops = &clip_neigh_ops; neigh->output = neigh->ops->output; entry->neigh = neigh; entry->vccs = NULL; entry->expires = jiffies - 1; return 0; } /* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ /* * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, * don't increment the usage count. This is used to create entries in * clip_setentry. */ static int clip_encap(struct atm_vcc *vcc, int mode) { if (!CLIP_VCC(vcc)) return -EBADFD; CLIP_VCC(vcc)->encap = mode; return 0; } static netdev_tx_t clip_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct clip_priv *clip_priv = PRIV(dev); struct dst_entry *dst = skb_dst(skb); struct atmarp_entry *entry; struct neighbour *n; struct atm_vcc *vcc; struct rtable *rt; __be32 *daddr; int old; unsigned long flags; pr_debug("(skb %p)\n", skb); if (!dst) { pr_err("skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } rt = (struct rtable *) dst; if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); if (!n) { pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } entry = neighbour_priv(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key)); } if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) skb_queue_tail(&entry->neigh->arp_queue, skb); else { dev_kfree_skb(skb); dev->stats.tx_dropped++; } goto out_release_neigh; } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; pr_debug("using neighbour %p, vcc %p\n", n, vcc); if (entry->vccs->encap) { void *here; here = skb_push(skb, RFC1483LLC_LEN); memcpy(here, llc_oui, sizeof(llc_oui)); ((__be16 *) here)[3] = skb->protocol; } atm_account_tx(vcc, skb); entry->vccs->last_use = jiffies; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ if (old) { pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; vcc->send(vcc, skb); if (atm_may_send(vcc, 0)) { entry->vccs->xoff = 0; goto out_release_neigh; } spin_lock_irqsave(&clip_priv->xoff_lock, flags); netif_stop_queue(dev); /* XOFF -> throttle immediately */ barrier(); if (!entry->vccs->xoff) netif_start_queue(dev); /* Oh, we just raced with clip_pop. netif_start_queue should be good enough, because nothing should really be asleep because of the brief netif_stop_queue. If this isn't true or if it changes, use netif_wake_queue instead. */ spin_unlock_irqrestore(&clip_priv->xoff_lock, flags); out_release_neigh: neigh_release(n); return NETDEV_TX_OK; } static int clip_mkip(struct atm_vcc *vcc, int timeout) { struct clip_vcc *clip_vcc; if (!vcc->push) return -EBADFD; clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); if (!clip_vcc) return -ENOMEM; pr_debug("%p vcc %p\n", clip_vcc, vcc); clip_vcc->vcc = vcc; vcc->user_back = clip_vcc; set_bit(ATM_VF_IS_CLIP, &vcc->flags); clip_vcc->entry = NULL; clip_vcc->xoff = 0; clip_vcc->encap = 1; clip_vcc->last_use = jiffies; clip_vcc->idle_timeout = timeout * HZ; clip_vcc->old_push = vcc->push; clip_vcc->old_pop = vcc->pop; vcc->push = clip_push; vcc->pop = clip_pop; /* re-process everything received between connection setup and MKIP */ vcc_process_recv_queue(vcc); return 0; } static int clip_setentry(struct atm_vcc *vcc, __be32 ip) { struct neighbour *neigh; struct atmarp_entry *entry; int error; struct clip_vcc *clip_vcc; struct rtable *rt; if (vcc->push != clip_push) { pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); if (!ip) { if (!clip_vcc->entry) { pr_err("hiding hidden ATMARP entry\n"); return 0; } pr_debug("remove\n"); unlink_clip_vcc(clip_vcc); return 0; } rt = ip_route_output(&init_net, ip, 0, 1, 0); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if (!neigh) return -ENOMEM; entry = neighbour_priv(neigh); if (entry != clip_vcc->entry) { if (!clip_vcc->entry) pr_debug("add\n"); else { pr_debug("update\n"); unlink_clip_vcc(clip_vcc); } link_vcc(clip_vcc, entry); } error = neigh_update(neigh, llc_oui, NUD_PERMANENT, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); return error; } static const struct net_device_ops clip_netdev_ops = { .ndo_start_xmit = clip_start_xmit, .ndo_neigh_construct = clip_constructor, }; static void clip_setup(struct net_device *dev) { dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->neigh_priv_len = sizeof(struct atmarp_entry); dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; dev->tx_queue_len = 100; /* "normal" queue (packets) */ /* When using a "real" qdisc, the qdisc determines the queue */ /* length. tx_queue_len is only used for the default case, */ /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ netif_keep_dst(dev); } static int clip_create(int number) { struct net_device *dev; struct clip_priv *clip_priv; int error; if (number != -1) { for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number == number) return -EEXIST; } else { number = 0; for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); sprintf(dev->name, "atm%d", number); spin_lock_init(&clip_priv->xoff_lock); clip_priv->number = number; error = register_netdev(dev); if (error) { free_netdev(dev); return error; } clip_priv->next = clip_devs; clip_devs = dev; pr_debug("registered (net:%s)\n", dev->name); return number; } static int clip_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) return NOTIFY_DONE; /* ignore non-CLIP devices */ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { case NETDEV_UP: pr_debug("NETDEV_UP\n"); to_atmarpd(act_up, PRIV(dev)->number, 0); break; case NETDEV_GOING_DOWN: pr_debug("NETDEV_DOWN\n"); to_atmarpd(act_down, PRIV(dev)->number, 0); break; case NETDEV_CHANGE: case NETDEV_CHANGEMTU: pr_debug("NETDEV_CHANGE*\n"); to_atmarpd(act_change, PRIV(dev)->number, 0); break; } return NOTIFY_DONE; } static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* * Transitions are of the down-change-up type, so it's sufficient to * handle the change on up. */ if (event != NETDEV_UP) return NOTIFY_DONE; netdev_notifier_info_init(&info, in_dev->dev); return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { .notifier_call = clip_device_event, }; static struct notifier_block clip_inet_notifier = { .notifier_call = clip_inet_event, }; static void atmarpd_close(struct atm_vcc *vcc) { pr_debug("\n"); rtnl_lock(); atmarpd = NULL; skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); rtnl_unlock(); pr_debug("(done)\n"); module_put(THIS_MODULE); } static const struct atmdev_ops atmarpd_dev_ops = { .close = atmarpd_close }; static struct atm_dev atmarpd_dev = { .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; static int atm_init_atmarp(struct atm_vcc *vcc) { rtnl_lock(); if (atmarpd) { rtnl_unlock(); return -EADDRINUSE; } mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); atmarpd = vcc; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* allow replies and avoid getting closed if signaling dies */ vcc->dev = &atmarpd_dev; vcc_insert_socket(sk_atm(vcc)); vcc->push = NULL; vcc->pop = NULL; /* crash */ vcc->push_oam = NULL; /* crash */ rtnl_unlock(); return 0; } static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case SIOCMKCLIP: case ATMARPD_CTRL: case ATMARP_MKIP: case ATMARP_SETENTRY: case ATMARP_ENCAP: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } switch (cmd) { case SIOCMKCLIP: err = clip_create(arg); break; case ATMARPD_CTRL: err = atm_init_atmarp(vcc); if (!err) { sock->state = SS_CONNECTED; __module_get(THIS_MODULE); } break; case ATMARP_MKIP: err = clip_mkip(vcc, arg); break; case ATMARP_SETENTRY: err = clip_setentry(vcc, (__force __be32)arg); break; case ATMARP_ENCAP: err = clip_encap(vcc, arg); break; } return err; } static struct atm_ioctl clip_ioctl_ops = { .owner = THIS_MODULE, .ioctl = clip_ioctl, }; #ifdef CONFIG_PROC_FS static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) { static int code[] = { 1, 2, 10, 6, 1, 0 }; static int e164[] = { 1, 8, 4, 6, 1, 0 }; if (*addr->sas_addr.pub) { seq_printf(seq, "%s", addr->sas_addr.pub); if (*addr->sas_addr.prv) seq_putc(seq, '+'); } else if (!*addr->sas_addr.prv) { seq_printf(seq, "%s", "(none)"); return; } if (*addr->sas_addr.prv) { unsigned char *prv = addr->sas_addr.prv; int *fields; int i, j; fields = *prv == ATM_AFI_E164 ? e164 : code; for (i = 0; fields[i]; i++) { for (j = fields[i]; j; j--) seq_printf(seq, "%02X", *prv++); if (fields[i + 1]) seq_putc(seq, '.'); } } } /* This means the neighbour entry has no attached VCC objects. */ #define SEQ_NO_VCC_TOKEN ((void *) 2) static void atmarp_info(struct seq_file *seq, struct neighbour *n, struct atmarp_entry *entry, struct clip_vcc *clip_vcc) { struct net_device *dev = n->dev; unsigned long exp; char buf[17]; int svc, llc, off; svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap); if (clip_vcc == SEQ_NO_VCC_TOKEN) exp = entry->neigh->used; else exp = clip_vcc->last_use; exp = (jiffies - exp) / HZ; seq_printf(seq, "%-6s%-4s%-4s%5ld ", dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key); while (off < 16) buf[off++] = ' '; buf[off] = '\0'; seq_printf(seq, "%s", buf); if (clip_vcc == SEQ_NO_VCC_TOKEN) { if (time_before(jiffies, entry->expires)) seq_printf(seq, "(resolving)\n"); else seq_printf(seq, "(expired, ref %d)\n", refcount_read(&entry->neigh->refcnt)); } else if (!svc) { seq_printf(seq, "%d.%d.%d\n", clip_vcc->vcc->dev->number, clip_vcc->vcc->vpi, clip_vcc->vcc->vci); } else { svc_addr(seq, &clip_vcc->vcc->remote); seq_putc(seq, '\n'); } } struct clip_seq_state { /* This member must be first. */ struct neigh_seq_state ns; /* Local to clip specific iteration. */ struct clip_vcc *vcc; }; static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, struct clip_vcc *curr) { if (!curr) { curr = e->vccs; if (!curr) return SEQ_NO_VCC_TOKEN; return curr; } if (curr == SEQ_NO_VCC_TOKEN) return NULL; curr = curr->next; return curr; } static void *clip_seq_vcc_walk(struct clip_seq_state *state, struct atmarp_entry *e, loff_t * pos) { struct clip_vcc *vcc = state->vcc; vcc = clip_seq_next_vcc(e, vcc); if (vcc && pos != NULL) { while (*pos) { vcc = clip_seq_next_vcc(e, vcc); if (!vcc) break; --(*pos); } } state->vcc = vcc; return vcc; } static void *clip_seq_sub_iter(struct neigh_seq_state *_state, struct neighbour *n, loff_t * pos) { struct clip_seq_state *state = (struct clip_seq_state *)_state; if (n->dev->type != ARPHRD_ATM) return NULL; return clip_seq_vcc_walk(state, neighbour_priv(n), pos); } static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { struct clip_seq_state *state = seq->private; state->ns.neigh_sub_iter = clip_seq_sub_iter; return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY); } static int clip_seq_show(struct seq_file *seq, void *v) { static char atm_arp_banner[] = "IPitf TypeEncp Idle IP address ATM address\n"; if (v == SEQ_START_TOKEN) { seq_puts(seq, atm_arp_banner); } else { struct clip_seq_state *state = seq->private; struct clip_vcc *vcc = state->vcc; struct neighbour *n = v; atmarp_info(seq, n, neighbour_priv(n), vcc); } return 0; } static const struct seq_operations arp_seq_ops = { .start = clip_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = clip_seq_show, }; #endif static void atm_clip_exit_noproc(void); static int __init atm_clip_init(void) { register_atm_ioctl(&clip_ioctl_ops); register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { struct proc_dir_entry *p; p = proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops, sizeof(struct clip_seq_state)); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); return -ENOMEM; } } #endif return 0; } static void atm_clip_exit_noproc(void) { struct net_device *dev, *next; unregister_inetaddr_notifier(&clip_inet_notifier); unregister_netdevice_notifier(&clip_dev_notifier); deregister_atm_ioctl(&clip_ioctl_ops); /* First, stop the idle timer, so it stops banging * on the table. */ del_timer_sync(&idle_timer); dev = clip_devs; while (dev) { next = PRIV(dev)->next; unregister_netdev(dev); free_netdev(dev); dev = next; } } static void __exit atm_clip_exit(void) { remove_proc_entry("arp", atm_proc_root); atm_clip_exit_noproc(); } module_init(atm_clip_init); module_exit(atm_clip_exit); MODULE_AUTHOR("Werner Almesberger"); MODULE_DESCRIPTION("Classical/IP over ATM interface"); MODULE_LICENSE("GPL");
4872 6 246 216 229 93 48 4 223 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. */ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(task, &task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(&current->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(&current->sighand->siglock); schedule(); } #ifdef __ia64__ # define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 #else # define ___ARCH_SI_IA64(_a1, _a2, _a3) #endif int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)); int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. */ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. */ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(&current->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = &current->blocked; if (unlikely(test_restore_sigmask())) res = &current->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry_rcu(p->thread_group.next, struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) { return list_empty(&p->thread_group); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern bool thread_group_exited(struct pid *pid); extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */
35 306 305 298 39 289 46 11 11 11 296 15 296 296 296 23 296 296 15 296 15 296 296 296 306 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 69 69 69 35 1 128 40 39 39 127 32 3 26 25 223 224 161 9 9 2 7 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 // SPDX-License-Identifier: GPL-2.0-or-later /* * PF_INET6 socket protocol family * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Adapted from linux/net/ipv4/af_inet.c * * Fixes: * piggy, Karl Knutson : Socket protocol table * Hideaki YOSHIFUJI : sin6_scope_id support * Arnaldo Melo : check proc_net_create return, cleanups */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <linux/netfilter_ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> #include <net/ping.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/route.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ipv6_stubs.h> #include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif #include <net/calipso.h> #include <net/seg6.h> #include <net/rpl.h> #include <net/compat.h> #include <net/xfrm.h> #include <net/ioam6.h> #include <net/rawv6.h> #include <linux/uaccess.h> #include <linux/mroute6.h> #include "ip6_offload.h" MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); /* The inetsw6 table contains everything that inet6_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); struct ipv6_params ipv6_defaults = { .disable_ipv6 = 0, .autoconf = 1, }; static int disable_ipv6_mod; module_param_named(disable, disable_ipv6_mod, int, 0444); MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); bool ipv6_mod_enabled(void) { return disable_ipv6_mod == 0; } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } void inet6_sock_destruct(struct sock *sk) { inet6_cleanup_sock(sk); inet_sock_destruct(sk); } EXPORT_SYMBOL_GPL(inet6_sock_destruct); static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { struct inet_sock *inet; struct ipv6_pinfo *np; struct sock *sk; struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (err) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-10-proto-132-type-1 * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET6, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-10-proto-132 * (net-pf-PF_INET6-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET6, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOBUFS; sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; sock_init_data(sock, sk); err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet_set_bit(HDRINCL, sk); } sk->sk_destruct = inet6_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; sk->sk_backlog_rcv = answer->prot->backlog_rcv; inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. */ inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet->mc_index = 0; RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); if (err) { sk_common_release(sk); goto out; } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) { sk_common_release(sk); goto out; } } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) { sk_common_release(sk); goto out; } } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; bool saved_ipv6only; int addr_type = 0; int err = 0; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { err = -EINVAL; goto out; } /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { struct net_device *dev = NULL; int chk_addr_ret; /* Binding to v4-mapped address on a v6-only socket * makes no sense */ if (ipv6_only_sock(sk)) { err = -EINVAL; goto out; } rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out_unlock; } } /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); rcu_read_unlock(); if (!inet_addr_valid_or_nonlocal(net, inet, v4addr, chk_addr_ret)) { err = -EADDRNOTAVAIL; goto out; } } else { if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; rcu_read_lock(); if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another one * is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out_unlock; } } if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out_unlock; } } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { if (!ipv6_can_nonlocal_bind(net, inet) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; goto out_unlock; } } rcu_read_unlock(); } } inet->inet_rcv_saddr = v4addr; inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; saved_ipv6only = sk->sk_ipv6only; if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); goto out; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); if (sk->sk_prot->put_port) sk->sk_prot->put_port(sk); goto out; } } } if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_dport = 0; inet->inet_daddr = 0; out: if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; const struct proto *prot; int err = 0; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); /* If the socket has its own bind function then use it. */ if (prot->bind) return prot->bind(sk, uaddr, addr_len); if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, CGROUP_INET6_BIND, &flags); if (err) return err; return __inet6_bind(sk, uaddr, addr_len, flags); } /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { return inet6_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return -EINVAL; /* Free mc lists */ ipv6_sock_mc_close(sk); /* Free ac lists */ ipv6_sock_ac_close(sk); return inet_release(sock); } EXPORT_SYMBOL(inet6_release); void inet6_cleanup_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ipv6_txoptions *opt; /* Release rx options */ skb = xchg(&np->pktoptions, NULL); kfree_skb(skb); skb = xchg(&np->rxpmtu, NULL); kfree_skb(skb); /* Free flowlabels */ fl6_free_socklist(sk); /* Free tx options */ opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } } EXPORT_SYMBOL_GPL(inet6_cleanup_sock); /* * This does both peername and sockname. */ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin6_port = inet->inet_dport; sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, CGROUP_INET6_GETPEERNAME); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, CGROUP_INET6_GETSOCKNAME); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); release_sock(sk); return sizeof(*sin); } EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto *prot; switch (cmd) { case SIOCADDRT: case SIOCDELRT: { struct in6_rtmsg rtmsg; if (copy_from_user(&rtmsg, argp, sizeof(rtmsg))) return -EFAULT; return ipv6_route_ioctl(net, cmd, &rtmsg); } case SIOCSIFADDR: return addrconf_add_ifaddr(net, argp); case SIOCDIFADDR: return addrconf_del_ifaddr(net, argp); case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, argp); default: /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (!prot->ioctl) return -ENOIOCTLCMD; return sk_ioctl(sk, cmd, (void __user *)arg); } /*NOTREACHED*/ return 0; } EXPORT_SYMBOL(inet6_ioctl); #ifdef CONFIG_COMPAT struct compat_in6_rtmsg { struct in6_addr rtmsg_dst; struct in6_addr rtmsg_src; struct in6_addr rtmsg_gateway; u32 rtmsg_type; u16 rtmsg_dst_len; u16 rtmsg_src_len; u32 rtmsg_metric; u32 rtmsg_info; u32 rtmsg_flags; s32 rtmsg_ifindex; }; static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_in6_rtmsg __user *ur) { struct in6_rtmsg rt; if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, 3 * sizeof(struct in6_addr)) || get_user(rt.rtmsg_type, &ur->rtmsg_type) || get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) || get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) || get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || get_user(rt.rtmsg_info, &ur->rtmsg_info) || get_user(rt.rtmsg_flags, &ur->rtmsg_flags) || get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex)) return -EFAULT; return ipv6_route_ioctl(sock_net(sk), cmd, &rt); } int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet6_compat_routing_ioctl(sk, cmd, argp); default: return -ENOIOCTLCMD; } } EXPORT_SYMBOL_GPL(inet6_compat_ioctl); #endif /* CONFIG_COMPAT */ INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, sk, msg, size); } INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_stream_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_skb = udp_read_skb, .mmap = sock_no_mmap, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static const struct net_proto_family inet6_family_ops = { .family = PF_INET6, .create = inet6_create, .owner = THIS_MODULE, }; int inet6_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; struct list_head *last_perm; int protocol = p->protocol; int ret; spin_lock_bh(&inetsw6_lock); ret = -EINVAL; if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ answer = NULL; ret = -EPERM; last_perm = &inetsw6[p->type]; list_for_each(lh, &inetsw6[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol) break; last_perm = lh; } answer = NULL; } if (answer) goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); ret = 0; out: spin_unlock_bh(&inetsw6_lock); return ret; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet6_register_protosw); void inet6_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw6_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw6_lock); synchronize_net(); } } EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct dst_entry *dst; dst = __sk_dst_check(sk, np->dst_cookie); if (!dst) { struct inet_sock *inet = inet_sk(sk); struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = sk->sk_protocol; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = np->saddr; fl6.flowlabel = np->flow_label; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); return PTR_ERR(dst); } ip6_dst_store(sk, dst, NULL, NULL); } return 0; } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); if (np->rxopt.all) { if (((opt->flags & IP6SKB_HOPBYHOP) && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) return true; } return false; } EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) { dev_add_pack(&ipv6_packet_type); return 0; } static void ipv6_packet_cleanup(void) { dev_remove_pack(&ipv6_packet_type); } static int __net_init ipv6_init_mibs(struct net *net) { int i; net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); if (!net->mib.udp_stats_in6) return -ENOMEM; net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); if (!net->mib.udplite_stats_in6) goto err_udplite_mib; net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ipv6_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet6_stats; af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); u64_stats_init(&af_inet6_stats->syncp); } net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); if (!net->mib.icmpv6_statistics) goto err_icmp_mib; net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), GFP_KERNEL); if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; return 0; err_icmpmsg_mib: free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: free_percpu(net->mib.ipv6_statistics); err_ip_mib: free_percpu(net->mib.udplite_stats_in6); err_udplite_mib: free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } static void ipv6_cleanup_mibs(struct net *net) { free_percpu(net->mib.udp_stats_in6); free_percpu(net->mib.udplite_stats_in6); free_percpu(net->mib.ipv6_statistics); free_percpu(net->mib.icmpv6_statistics); kfree(net->mib.icmpv6msg_statistics); } static int __net_init inet6_net_init(struct net *net) { int err = 0; net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0; /* By default, rate limit error messages. * Except for pmtu discovery, it would break it. * proc_do_large_bitmap needs pointer to the bitmap. */ bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; err = ipv6_init_mibs(net); if (err) return err; #ifdef CONFIG_PROC_FS err = udp6_proc_init(net); if (err) goto out; err = tcp6_proc_init(net); if (err) goto proc_tcp6_fail; err = ac6_proc_init(net); if (err) goto proc_ac6_fail; #endif return err; #ifdef CONFIG_PROC_FS proc_ac6_fail: tcp6_proc_exit(net); proc_tcp6_fail: udp6_proc_exit(net); out: ipv6_cleanup_mibs(net); return err; #endif } static void __net_exit inet6_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS udp6_proc_exit(net); tcp6_proc_exit(net); ac6_proc_exit(net); #endif ipv6_cleanup_mibs(net); } static struct pernet_operations inet6_net_ops = { .init = inet6_net_init, .exit = inet6_net_exit, }; static int ipv6_route_input(struct sk_buff *skb) { ip6_route_input(skb); return skb_dst(skb)->error; } static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, .ipv6_route_input = ipv6_route_input, .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, .fib6_select_path = fib6_select_path, .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, .fib6_nh_release_dsts = fib6_nh_release_dsts, .fib6_update_sernum = fib6_update_sernum_stub, .fib6_rt_update = fib6_rt_update, .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, .ipv6_dev_find = ipv6_dev_find, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_setsockopt = do_ipv6_setsockopt, .ipv6_getsockopt = do_ipv6_getsockopt, }; static int __init inet6_init(void) { struct list_head *r; int err = 0; sock_skb_cb_check_size(sizeof(struct inet6_skb_parm)); /* Register the socket-side information for inet6_create. */ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); raw_hashinfo_init(&raw_v6_hashinfo); if (disable_ipv6_mod) { pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } err = proto_register(&tcpv6_prot, 1); if (err) goto out; err = proto_register(&udpv6_prot, 1); if (err) goto out_unregister_tcp_proto; err = proto_register(&udplitev6_prot, 1); if (err) goto out_unregister_udp_proto; err = proto_register(&rawv6_prot, 1); if (err) goto out_unregister_udplite_proto; err = proto_register(&pingv6_prot, 1); if (err) goto out_unregister_raw_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ err = rawv6_init(); if (err) goto out_unregister_ping_proto; /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ err = sock_register(&inet6_family_ops); if (err) goto out_sock_register_fail; /* * ipngwg API draft makes clear that the correct semantics * for TCP and UDP is to consider one TCP and UDP instance * in a host available by both INET and INET6 APIs and * able to communicate via both network protocols. */ err = register_pernet_subsys(&inet6_net_ops); if (err) goto register_pernet_fail; err = ip6_mr_init(); if (err) goto ipmr_fail; err = icmpv6_init(); if (err) goto icmp_fail; err = ndisc_init(); if (err) goto ndisc_fail; err = igmp6_init(); if (err) goto igmp_fail; err = ipv6_netfilter_init(); if (err) goto netfilter_fail; /* Create /proc/foo6 entries. */ #ifdef CONFIG_PROC_FS err = -ENOMEM; if (raw6_proc_init()) goto proc_raw6_fail; if (udplite6_proc_init()) goto proc_udplite6_fail; if (ipv6_misc_proc_init()) goto proc_misc6_fail; if (if6_proc_init()) goto proc_if6_fail; #endif err = ip6_route_init(); if (err) goto ip6_route_fail; err = ndisc_late_init(); if (err) goto ndisc_late_fail; err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; err = ipv6_anycast_init(); if (err) goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; /* Init v6 extension headers. */ err = ipv6_exthdrs_init(); if (err) goto ipv6_exthdrs_fail; err = ipv6_frag_init(); if (err) goto ipv6_frag_fail; /* Init v6 transport protocols. */ err = udpv6_init(); if (err) goto udpv6_fail; err = udplitev6_init(); if (err) goto udplitev6_fail; err = udpv6_offload_init(); if (err) goto udpv6_offload_fail; err = tcpv6_init(); if (err) goto tcpv6_fail; err = ipv6_packet_init(); if (err) goto ipv6_packet_fail; err = pingv6_init(); if (err) goto pingv6_fail; err = calipso_init(); if (err) goto calipso_fail; err = seg6_init(); if (err) goto seg6_fail; err = rpl_init(); if (err) goto rpl_fail; err = ioam6_init(); if (err) goto ioam6_fail; err = igmp6_late_init(); if (err) goto igmp6_late_err; #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) goto sysctl_fail; #endif /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; #ifdef CONFIG_SYSCTL sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: ioam6_exit(); ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); seg6_fail: calipso_exit(); calipso_fail: pingv6_exit(); pingv6_fail: ipv6_packet_cleanup(); ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: udpv6_offload_exit(); udpv6_offload_fail: udplitev6_exit(); udplitev6_fail: udpv6_exit(); udpv6_fail: ipv6_frag_exit(); ipv6_frag_fail: ipv6_exthdrs_exit(); ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: ipv6_anycast_cleanup(); ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); ndisc_late_fail: ip6_route_cleanup(); ip6_route_fail: #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: ipv6_misc_proc_exit(); proc_misc6_fail: udplite6_proc_exit(); proc_udplite6_fail: raw6_proc_exit(); proc_raw6_fail: #endif ipv6_netfilter_fini(); netfilter_fail: igmp6_cleanup(); igmp_fail: ndisc_cleanup(); ndisc_fail: icmpv6_cleanup(); icmp_fail: ip6_mr_cleanup(); ipmr_fail: unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); out_sock_register_fail: rawv6_exit(); out_unregister_ping_proto: proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: proto_unregister(&udplitev6_prot); out_unregister_udp_proto: proto_unregister(&udpv6_prot); out_unregister_tcp_proto: proto_unregister(&tcpv6_prot); goto out; } module_init(inet6_init); MODULE_ALIAS_NETPROTO(PF_INET6);
47 105 37 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LLIST_H #define LLIST_H /* * Lock-less NULL terminated single linked list * * Cases where locking is not needed: * If there are multiple producers and multiple consumers, llist_add can be * used in producers and llist_del_all can be used in consumers simultaneously * without locking. Also a single consumer can use llist_del_first while * multiple producers simultaneously use llist_add, without any locking. * * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, and * llist_del_first or llist_del_all used in other consumers, then a lock is * needed. This is because llist_del_first depends on list->first->next not * changing, but without lock protection, there's no way to be sure about that * if a preemption happens in the middle of the delete operation and on being * preempted back, the list->first is the same as before causing the cmpxchg in * llist_del_first to succeed. For example, while a llist_del_first operation * is in progress in one consumer, then a llist_del_first, llist_add, * llist_add (or llist_del_all, llist_add, llist_add) sequence in another * consumer may cause violations. * * This can be summarized as follows: * * | add | del_first | del_all * add | - | - | - * del_first | | L | L * del_all | | | - * * Where, a particular row's operation can happen concurrently with a column's * operation, with "-" being no lock needed, while "L" being lock is needed. * * The list entries deleted via llist_del_all can be traversed with * traversing function such as llist_for_each etc. But the list * entries can not be traversed safely before deleted from the list. * The order of deleted entries is from the newest to the oldest added * one. If you want to traverse from the oldest to the newest, you * must reverse the order by yourself before traversing. * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/stddef.h> #include <linux/types.h> struct llist_head { struct llist_node *first; }; struct llist_node { struct llist_node *next; }; #define LLIST_HEAD_INIT(name) { NULL } #define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) /** * init_llist_head - initialize lock-less list head * @head: the head for your lock-less list */ static inline void init_llist_head(struct llist_head *list) { list->first = NULL; } /** * llist_entry - get the struct of this entry * @ptr: the &struct llist_node pointer. * @type: the type of the struct this is embedded in. * @member: the name of the llist_node within the struct. */ #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) /** * member_address_is_nonnull - check whether the member address is not NULL * @ptr: the object pointer (struct type * that contains the llist_node) * @member: the name of the llist_node within the struct. * * This macro is conceptually the same as * &ptr->member != NULL * but it works around the fact that compilers can decide that taking a member * address is never a NULL pointer. * * Real objects that start at a high address and have a member at NULL are * unlikely to exist, but such pointers may be returned e.g. by the * container_of() macro. */ #define member_address_is_nonnull(ptr, member) \ ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) /** * llist_for_each_safe - iterate over some deleted entries of a lock-less list * safe against removal of list entry * @pos: the &struct llist_node to use as a loop cursor * @n: another &struct llist_node to use as temporary storage * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_safe(pos, n, node) \ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type * safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @node: the first entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) /** * llist_empty - tests whether a lock-less list is empty * @head: the list to test * * Not guaranteed to be accurate or up to date. Just a quick way to * test whether the list is empty without deleting something from the * list. */ static inline bool llist_empty(const struct llist_head *head) { return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) { return node->next; } extern bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head); static inline bool __llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { new_last->next = head->first; head->first = new_first; return new_last->next == NULL; } /** * llist_add - add a new entry * @new: new entry to be added * @head: the head for your lock-less list * * Returns true if the list was empty prior to adding this entry. */ static inline bool llist_add(struct llist_node *new, struct llist_head *head) { return llist_add_batch(new, new, head); } static inline bool __llist_add(struct llist_node *new, struct llist_head *head) { return __llist_add_batch(new, new, head); } /** * llist_del_all - delete all entries from lock-less list * @head: the head of lock-less list to delete all entries * * If list is empty, return NULL, otherwise, delete all entries and * return the pointer to the first entry. The order of entries * deleted is from the newest to the oldest added one. */ static inline struct llist_node *llist_del_all(struct llist_head *head) { return xchg(&head->first, NULL); } static inline struct llist_node *__llist_del_all(struct llist_head *head) { struct llist_node *first = head->first; head->first = NULL; return first; } extern struct llist_node *llist_del_first(struct llist_head *head); struct llist_node *llist_reverse_order(struct llist_node *head); #endif /* LLIST_H */
2 2 2 2 2 2 2 7 6 4 3 3 2 2 7 9 10 10 3 3 3 2 2 6 6 6 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 76 75 73 1 2 1 1 76 33 34 34 35 36 36 35 35 36 1 36 35 36 35 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 6 6 6 6 6 6 6 6 6 16 16 16 16 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 // SPDX-License-Identifier: GPL-2.0 /* * Wireless utility functions * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/gcd.h> #include <linux/bitfield.h> #include <linux/nospec.h> #include "core.h" #include "rdev-ops.h" const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; for (i = 0; i < sband->n_bitrates; i++) { if (!(basic_rates & BIT(i))) continue; if (sband->bitrates[i].bitrate > bitrate) continue; result = &sband->bitrates[i]; } return result; } EXPORT_SYMBOL(ieee80211_get_response_rate); u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, enum nl80211_bss_scan_width scan_width) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; enum ieee80211_rate_flags mandatory_flag; int i; if (WARN_ON(!sband)) return 1; if (sband->band == NL80211_BAND_2GHZ) { if (scan_width == NL80211_BSS_CHAN_WIDTH_5 || scan_width == NL80211_BSS_CHAN_WIDTH_10) mandatory_flag = IEEE80211_RATE_MANDATORY_G; else mandatory_flag = IEEE80211_RATE_MANDATORY_B; } else { mandatory_flag = IEEE80211_RATE_MANDATORY_A; } bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) if (bitrates[i].flags & mandatory_flag) mandatory_rates |= BIT(i); return mandatory_rates; } EXPORT_SYMBOL(ieee80211_mandatory_rates); u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return MHZ_TO_KHZ(4000 + chan * 5); else return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D6.1 27.3.23.2 */ if (chan == 2) return MHZ_TO_KHZ(5935); if (chan <= 233) return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; case NL80211_BAND_S1GHZ: return 902000 + chan * 500; default: ; } return 0; /* not supported */ } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); enum nl80211_chan_width ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) { if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) return NL80211_CHAN_WIDTH_20_NOHT; /*S1G defines a single allowed channel width per channel. * Extract that width here. */ if (chan->flags & IEEE80211_CHAN_1MHZ) return NL80211_CHAN_WIDTH_1; else if (chan->flags & IEEE80211_CHAN_2MHZ) return NL80211_CHAN_WIDTH_2; else if (chan->flags & IEEE80211_CHAN_4MHZ) return NL80211_CHAN_WIDTH_4; else if (chan->flags & IEEE80211_CHAN_8MHZ) return NL80211_CHAN_WIDTH_8; else if (chan->flags & IEEE80211_CHAN_16MHZ) return NL80211_CHAN_WIDTH_16; pr_err("unknown channel width for channel at %dKHz?\n", ieee80211_channel_to_khz(chan)); return NL80211_CHAN_WIDTH_1; } EXPORT_SYMBOL(ieee80211_s1g_channel_width); int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ freq = KHZ_TO_MHZ(freq); /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; else if (freq < 2484) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; else if (freq < 5925) return (freq - 5000) / 5; else if (freq == 5935) return 2; else if (freq <= 45000) /* DMG band lower limit */ /* see 802.11ax D6.1 27.3.22.2 */ return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; } EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (ieee80211_channel_to_khz(chan) == freq) return chan; } } return NULL; } EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. */ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only 0 and - when supporting * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. */ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing framents, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. */ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, remaining, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; remaining = skb->len - offset; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0, remaining; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { unsigned int subframe_len; int len, mesh_len = 0; u8 padding; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ remaining = skb->len - offset; if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? */ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } ret = dscp >> 5; out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); wdev_lock(wdev); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.bssid, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } wdev_unlock(wdev); kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; wdev_lock(dev->ieee80211_ptr); rdev_set_qos_map(rdev, dev, NULL); wdev_unlock(dev->ieee80211_ptr); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: wdev_lock(dev->ieee80211_ptr); cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); wdev_unlock(dev->ieee80211_ptr); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_969[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_969[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... */ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) result = rates_52[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); /* and take NSS */ tmp *= rate->nss; do_div(tmp, 8); result = tmp; return result / 10000; } static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ static const u32 base[5][11] = { { 300000, 600000, 900000, 1200000, 1800000, 2400000, 2700000, 3000000, 3600000, 4000000, /* MCS 10 supported in 1 MHz only */ 150000, }, { 650000, 1300000, 1950000, 2600000, 3900000, 5200000, 5850000, 6500000, 7800000, /* MCS 9 not valid */ }, { 1350000, 2700000, 4050000, 5400000, 8100000, 10800000, 12150000, 13500000, 16200000, 18000000, }, { 2925000, 5850000, 8775000, 11700000, 17550000, 23400000, 26325000, 29250000, 35100000, 39000000, }, { 8580000, 11700000, 17550000, 23400000, 35100000, 46800000, 52650000, 58500000, 70200000, 78000000, }, }; u32 bitrate; /* default is 1 MHz index */ int idx = 0; if (rate->mcs >= 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_16: idx = 4; break; case RATE_INFO_BW_8: idx = 3; break; case RATE_INFO_BW_4: idx = 2; break; case RATE_INFO_BW_2: idx = 1; break; case RATE_INFO_BW_1: idx = 0; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: case RATE_INFO_BW_20: case RATE_INFO_BW_40: case RATE_INFO_BW_80: case RATE_INFO_BW_160: default: goto warn; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, enum ieee80211_p2p_attr_id attr, u8 *buf, unsigned int bufsize) { u8 *out = buf; u16 attr_remaining = 0; bool desired_attr = false; u16 desired_len = 0; while (len > 0) { unsigned int iedatalen; unsigned int copy; const u8 *iedata; if (len < 2) return -EILSEQ; iedatalen = ies[1]; if (iedatalen + 2 > len) return -EILSEQ; if (ies[0] != WLAN_EID_VENDOR_SPECIFIC) goto cont; if (iedatalen < 4) goto cont; iedata = ies + 2; /* check WFA OUI, P2P subtype */ if (iedata[0] != 0x50 || iedata[1] != 0x6f || iedata[2] != 0x9a || iedata[3] != 0x09) goto cont; iedatalen -= 4; iedata += 4; /* check attribute continuation into this IE */ copy = min_t(unsigned int, attr_remaining, iedatalen); if (copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 135: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else *op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different) { struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } for (i = 0; i < wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *c; struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &wiphy->iface_combinations[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. */ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. */ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_WDEV_LOCK(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: __cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } wdev->valid_links &= ~BIT(link_id); rdev_del_intf_link(rdev, wdev, link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; wdev_lock(wdev); if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } wdev_unlock(wdev); } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa);
2 2 2 2 2 1 2 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // SPDX-License-Identifier: GPL-2.0-only /* * File: pn_netlink.c * * Phonet netlink interface * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Remi Denis-Courmont */ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/phonet.h> #include <linux/slab.h> #include <net/sock.h> #include <net/phonet/pn_dev.h> /* Device address handling */ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, u32 portid, u32 seq, int event); void phonet_address_notify(int event, struct net_device *dev, u8 addr) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(1), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_addr(skb, dev, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, dev_net(dev), 0, RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); } static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U8 }, }; static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; struct net_device *dev; struct ifaddrmsg *ifm; int err; u8 pnaddr; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, extack); if (err < 0) return err; ifm = nlmsg_data(nlh); if (tb[IFA_LOCAL] == NULL) return -EINVAL; pnaddr = nla_get_u8(tb[IFA_LOCAL]); if (pnaddr & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; dev = __dev_get_by_index(net, ifm->ifa_index); if (dev == NULL) return -ENODEV; if (nlh->nlmsg_type == RTM_NEWADDR) err = phonet_address_add(dev, pnaddr); else err = phonet_address_del(dev, pnaddr); if (!err) phonet_address_notify(nlh->nlmsg_type, dev, pnaddr); return err; } static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, u32 portid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), 0); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_PHONET; ifm->ifa_prefixlen = 0; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_LINK; ifm->ifa_index = dev->ifindex; if (nla_put_u8(skb, IFA_LOCAL, addr)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct phonet_device_list *pndevs; struct phonet_device *pnd; int dev_idx = 0, dev_start_idx = cb->args[0]; int addr_idx = 0, addr_start_idx = cb->args[1]; pndevs = phonet_device_list(sock_net(skb->sk)); rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { u8 addr; if (dev_idx > dev_start_idx) addr_start_idx = 0; if (dev_idx++ < dev_start_idx) continue; addr_idx = 0; for_each_set_bit(addr, pnd->addrs, 64) { if (addr_idx++ < addr_start_idx) continue; if (fill_addr(skb, pnd->netdev, addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) goto out; } } out: rcu_read_unlock(); cb->args[0] = dev_idx; cb->args[1] = addr_idx; return skb->len; } /* Routes handling */ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, u32 portid, u32 seq, int event) { struct rtmsg *rtm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), 0); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_PHONET; rtm->rtm_dst_len = 6; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = RTPROT_STATIC; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_u8(skb, RTA_DST, dst) || nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(1) + nla_total_size(4), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_route(skb, dev, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, dev_net(dev), 0, RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err); } static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_OIF] = { .type = NLA_U32 }, }; static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; struct net_device *dev; struct rtmsg *rtm; int err; u8 dst; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST) return -EINVAL; if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL) return -EINVAL; dst = nla_get_u8(tb[RTA_DST]); if (dst & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF])); if (dev == NULL) return -ENODEV; if (nlh->nlmsg_type == RTM_NEWROUTE) err = phonet_route_add(dev, dst); else err = phonet_route_del(dev, dst); if (!err) rtm_phonet_notify(nlh->nlmsg_type, dev, dst); return err; } static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u8 addr; rcu_read_lock(); for (addr = cb->args[0]; addr < 64; addr++) { struct net_device *dev = phonet_route_get_rcu(net, addr << 2); if (!dev) continue; if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE) < 0) goto out; } out: rcu_read_unlock(); cb->args[0] = addr; return skb->len; } int __init phonet_netlink_register(void) { int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR, addr_doit, NULL, 0); if (err) return err; /* Further rtnl_register_module() cannot fail */ rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0); return 0; }
1322 4856 4858 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RT_H #define _LINUX_SCHED_RT_H #include <linux/sched.h> struct task_struct; static inline int rt_prio(int prio) { if (unlikely(prio < MAX_RT_PRIO)) return 1; return 0; } static inline int rt_task(struct task_struct *p) { return rt_prio(p->prio); } static inline bool task_is_realtime(struct task_struct *tsk) { int policy = tsk->policy; if (policy == SCHED_FIFO || policy == SCHED_RR) return true; if (policy == SCHED_DEADLINE) return true; return false; } #ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. */ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { return p->pi_top_task; } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) #endif extern void normalize_rt_tasks(void); /* * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ #define RR_TIMESLICE (100 * HZ / 1000) #endif /* _LINUX_SCHED_RT_H */
1528 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 /* Copyright 2011, Siemens AG * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ /* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* Jon's code is based on 6lowpan implementation for Contiki which is: * Copyright (c) 2008, Swedish Institute of Computer Science. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <linux/if_arp.h> #include <net/ipv6.h> #include "6lowpan_i.h" static int open_count; static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; static int lowpan_dev_init(struct net_device *ldev) { netdev_lockdep_set_classes(ldev); return 0; } static int lowpan_open(struct net_device *dev) { if (!open_count) lowpan_rx_init(); open_count++; return 0; } static int lowpan_stop(struct net_device *dev) { open_count--; if (!open_count) lowpan_rx_exit(); return 0; } static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); /* default no short_addr is available for a neighbour */ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); return 0; } static int lowpan_get_iflink(const struct net_device *dev) { return lowpan_802154_dev(dev)->wdev->ifindex; } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) { memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; ldev->features |= NETIF_F_NETNS_LOCAL; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) return -EINVAL; } return 0; } static int lowpan_newlink(struct net *src_net, struct net_device *ldev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_device *wdev; int ret; ASSERT_RTNL(); pr_debug("adding new link\n"); if (!tb[IFLA_LINK]) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) return -ENODEV; if (wdev->type != ARPHRD_IEEE802154) { dev_put(wdev); return -EINVAL; } if (wdev->ieee802154_ptr->lowpan_dev) { dev_put(wdev); return -EBUSY; } lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 * header. */ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; } wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; lowpan_unregister_netdevice(ldev); dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, .validate = lowpan_validate, }; static inline int __init lowpan_netlink_init(void) { return rtnl_link_register(&lowpan_link_ops); } static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev; if (ndev->type != ARPHRD_IEEE802154) return NOTIFY_DONE; wpan_dev = ndev->ieee802154_ptr; if (!wpan_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: /* Check if wpan interface is unregistered that we * also delete possible lowpan interfaces which belongs * to the wpan interface. */ if (wpan_dev->lowpan_dev) lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { .notifier_call = lowpan_device_event, }; static int __init lowpan_init_module(void) { int err = 0; err = lowpan_net_frag_init(); if (err < 0) goto out; err = lowpan_netlink_init(); if (err < 0) goto out_frag; err = register_netdevice_notifier(&lowpan_dev_notifier); if (err < 0) goto out_pack; return 0; out_pack: lowpan_netlink_fini(); out_frag: lowpan_net_frag_exit(); out: return err; } static void __exit lowpan_cleanup_module(void) { lowpan_netlink_fini(); lowpan_net_frag_exit(); unregister_netdevice_notifier(&lowpan_dev_notifier); } module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan");
147 1536 1534 147 147 140 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. */ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. * @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. */ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) sgid_attr = ERR_PTR(-ENODEV); rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. */ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&idprv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarentees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not read to handle, typically if the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { /* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv