Total coverage: 372254 of 2011806 lines (19%)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * User address space access functions.
 *
 * Copyright 1997 Andi Kleen <ak@muc.de>
 * Copyright 1997 Linus Torvalds
 * Copyright 2002 Andi Kleen <ak@suse.de>
 */
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/libnvdimm.h>

/*
 * Zero Userspace
 */

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * clean_cache_range - write back a cache range with CLWB
 * @addr:	virtual start address
 * @size:	number of bytes to write back
 *
 * Write back a cache range using the CLWB (cache line write back)
 * instruction. Note that @size is internally rounded up to be cache
 * line size aligned.
 */
static void clean_cache_range(void *addr, size_t size)
{
	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
	unsigned long clflush_mask = x86_clflush_size - 1;
	void *vend = addr + size;
	void *p;

	for (p = (void *)((unsigned long)addr & ~clflush_mask);
	     p < vend; p += x86_clflush_size)
		clwb(p);
}

void arch_wb_cache_pmem(void *addr, size_t size)
{
	clean_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);

long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
{
	unsigned long flushed, dest = (unsigned long) dst;
	long rc;

	stac();
	rc = __copy_user_nocache(dst, src, size);
	clac();

	/*
	 * __copy_user_nocache() uses non-temporal stores for the bulk
	 * of the transfer, but we need to manually flush if the
	 * transfer is unaligned. A cached memory copy is used when
	 * destination or size is not naturally aligned. That is:
	 *   - Require 8-byte alignment when size is 8 bytes or larger.
	 *   - Require 4-byte alignment when size is 4 bytes.
	 */
	if (size < 8) {
		if (!IS_ALIGNED(dest, 4) || size != 4)
			clean_cache_range(dst, size);
	} else {
		if (!IS_ALIGNED(dest, 8)) {
			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
			clean_cache_range(dst, 1);
		}

		flushed = dest - (unsigned long) dst;
		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
			clean_cache_range(dst + size - 1, 1);
	}
	return rc;
}

void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
	unsigned long dest = (unsigned long) _dst;
	unsigned long source = (unsigned long) _src;

	/* cache copy and flush to align dest */
	if (!IS_ALIGNED(dest, 8)) {
		size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);

		memcpy((void *) dest, (void *) source, len);
		clean_cache_range((void *) dest, len);
		dest += len;
		source += len;
		size -= len;
		if (!size)
			return;
	}

	/* 4x8 movnti loop */
	while (size >= 32) {
		asm("movq    (%0), %%r8\n"
		    "movq   8(%0), %%r9\n"
		    "movq  16(%0), %%r10\n"
		    "movq  24(%0), %%r11\n"
		    "movnti  %%r8,   (%1)\n"
		    "movnti  %%r9,  8(%1)\n"
		    "movnti %%r10, 16(%1)\n"
		    "movnti %%r11, 24(%1)\n"
		    :: "r" (source), "r" (dest)
		    : "memory", "r8", "r9", "r10", "r11");
		dest += 32;
		source += 32;
		size -= 32;
	}

	/* 1x8 movnti loop */
	while (size >= 8) {
		asm("movq    (%0), %%r8\n"
		    "movnti  %%r8,   (%1)\n"
		    :: "r" (source), "r" (dest)
		    : "memory", "r8");
		dest += 8;
		source += 8;
		size -= 8;
	}

	/* 1x4 movnti loop */
	while (size >= 4) {
		asm("movl    (%0), %%r8d\n"
		    "movnti %%r8d,   (%1)\n"
		    :: "r" (source), "r" (dest)
		    : "memory", "r8");
		dest += 4;
		source += 4;
		size -= 4;
	}

	/* cache copy for remaining bytes */
	if (size) {
		memcpy((void *) dest, (void *) source, size);
		clean_cache_range((void *) dest, size);
	}
}
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
#endif
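The rounding that clean_cache_range() performs is worth seeing in isolation: the start address is masked down to a cache-line boundary, and the loop then advances one line at a time until it passes addr + size, so a small store that straddles a line boundary writes back two lines. Below is a minimal userspace sketch of that arithmetic; the 64-byte line size and the lines_touched() helper are assumptions for illustration (the kernel reads the real line size from boot_cpu_data.x86_clflush_size):

/* Standalone sketch of clean_cache_range()'s rounding; not kernel code. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define CLFLUSH_SIZE 64u	/* assumed cache line size */

static unsigned lines_touched(uintptr_t addr, size_t size)
{
	uintptr_t mask = CLFLUSH_SIZE - 1;
	uintptr_t p = addr & ~mask;	/* round down to the line start */
	uintptr_t end = addr + size;
	unsigned n = 0;

	for (; p < end; p += CLFLUSH_SIZE)
		n++;			/* one CLWB per line in the kernel */
	return n;
}

int main(void)
{
	printf("%u\n", lines_touched(0x1000, 8));	/* prints 1 */
	/* an 8-byte write straddling a line boundary flushes two lines */
	printf("%u\n", lines_touched(0x103c, 8));	/* prints 2 */
	return 0;
}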
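Similarly, once __memcpy_flushcache() has aligned the destination to 8 bytes, the rest of the function is a simple tiering of non-temporal stores: as many 32-byte groups as fit, then single 8-byte stores, then 4-byte stores, then a cached tail that is flushed explicitly. A standalone sketch of just the size split, with illustrative names and no actual movnti instructions:

/* Sketch of the copy tiering in __memcpy_flushcache(); reports the split
 * for one example size rather than performing any stores. */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t size = 100;		/* example transfer length */
	size_t n32 = size / 32;		/* 4x8 movnti iterations */
	size %= 32;
	size_t n8 = size / 8;		/* 1x8 movnti iterations */
	size %= 8;
	size_t n4 = size / 4;		/* 1x4 movnti iterations */
	size %= 4;			/* cached, explicitly flushed tail */

	printf("32B:%zu 8B:%zu 4B:%zu tail:%zu\n", n32, n8, n4, size);
	/* prints: 32B:3 8B:0 4B:1 tail:0 */
	return 0;
}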
// SPDX-License-Identifier: GPL-2.0
/*
 * Universal Host Controller Interface driver for USB.
 *
 * Maintainer: Alan Stern <stern@rowland.harvard.edu>
 *
 * (C) Copyright 1999 Linus Torvalds
 * (C) Copyright 1999-2002 Johannes Erdfelt, johannes@erdfelt.com
 * (C) Copyright 1999 Randy Dunlap
 * (C) Copyright 1999 Georg Acher, acher@in.tum.de
 * (C) Copyright 1999 Deti Fliegl, deti@fliegl.de
 * (C) Copyright 1999 Thomas Sailer, sailer@ife.ee.ethz.ch
 * (C) Copyright 1999 Roman Weissgaerber, weissg@vienna.at
 * (C) Copyright 2000 Yggdrasil Computing, Inc. (port of new PCI interface
 *               support from usb-ohci.c by Adam Richter, adam@yggdrasil.com).
 * (C) Copyright 1999 Gregory P. Smith (from usb-ohci.c)
 * (C) Copyright 2004-2007 Alan Stern, stern@rowland.harvard.edu
 *
 * Intel documents this fairly well, and as far as I know there
 * are no royalties or anything like that, but even so there are
 * people who decided that they want to do the same thing in a
 * completely different way.
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/ioport.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/pm.h>
#include <linux/dmapool.h>
#include <linux/dma-mapping.h>
#include <linux/usb.h>
#include <linux/usb/hcd.h>
#include <linux/bitops.h>
#include <linux/dmi.h>

#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/irq.h>

#include "uhci-hcd.h"

/*
 * Version Information
 */
#define DRIVER_AUTHOR						\
	"Linus 'Frodo Rabbit' Torvalds, Johannes Erdfelt, "	\
	"Randy Dunlap, Georg Acher, Deti Fliegl, Thomas Sailer, " \
	"Roman Weissgaerber, Alan Stern"
#define DRIVER_DESC "USB Universal Host Controller Interface driver"

/* for flakey hardware, ignore overcurrent indicators */
static bool ignore_oc;
module_param(ignore_oc, bool, S_IRUGO);
MODULE_PARM_DESC(ignore_oc, "ignore hardware overcurrent indications");

/*
 * debug = 0, no debugging messages
 * debug = 1, dump failed URBs except for stalls
 * debug = 2, dump all failed URBs (including stalls)
 *            show all queues in /sys/kernel/debug/uhci/[pci_addr]
 * debug = 3, show all TDs in URBs when dumping
 */
#ifdef CONFIG_DYNAMIC_DEBUG

static int debug = 1;
module_param(debug, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(debug, "Debug level");
static char *errbuf;

#else

#define debug 0
#define errbuf NULL

#endif

#define ERRBUF_LEN    (32 * 1024)

static struct kmem_cache *uhci_up_cachep;	/* urb_priv */

static void suspend_rh(struct uhci_hcd *uhci, enum uhci_rh_state new_state);
static void wakeup_rh(struct uhci_hcd *uhci);
static void uhci_get_current_frame_number(struct uhci_hcd *uhci);

/*
 * Calculate the link pointer DMA value for the first Skeleton QH in a frame.
 */
static __hc32 uhci_frame_skel_link(struct uhci_hcd *uhci, int frame)
{
	int skelnum;

	/*
	 * The interrupt queues will be interleaved as evenly as possible.
	 * There's not much to be done about period-1 interrupts; they have
	 * to occur in every frame.  But we can schedule period-2 interrupts
	 * in odd-numbered frames, period-4 interrupts in frames congruent
	 * to 2 (mod 4), and so on.  This way each frame only has two
	 * interrupt QHs, which will help spread out bandwidth utilization.
	 *
	 * ffs (Find First bit Set) does exactly what we need:
	 * 1,3,5,...  => ffs = 0 => use period-2 QH = skelqh[8],
	 * 2,6,10,... => ffs = 1 => use period-4 QH = skelqh[7], etc.
	 * ffs >= 7 => not on any high-period queue, so use
	 *	period-1 QH = skelqh[9].
	 * Add in UHCI_NUMFRAMES to insure at least one bit is set.
	 */
	skelnum = 8 - (int) __ffs(frame | UHCI_NUMFRAMES);
	if (skelnum <= 1)
		skelnum = 9;
	return LINK_TO_QH(uhci, uhci->skelqh[skelnum]);
}

#include "uhci-debug.c"
#include "uhci-q.c"
#include "uhci-hub.c"

/*
 * Finish up a host controller reset and update the recorded state.
 */
static void finish_reset(struct uhci_hcd *uhci)
{
	int port;

	/* HCRESET doesn't affect the Suspend, Reset, and Resume Detect
	 * bits in the port status and control registers.
	 * We have to clear them by hand.
	 */
	for (port = 0; port < uhci->rh_numports; ++port)
		uhci_writew(uhci, 0, USBPORTSC1 + (port * 2));

	uhci->port_c_suspend = uhci->resuming_ports = 0;
	uhci->rh_state = UHCI_RH_RESET;
	uhci->is_stopped = UHCI_IS_STOPPED;
	clear_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
}

/*
 * Last rites for a defunct/nonfunctional controller
 * or one we don't want to use any more.
 */
static void uhci_hc_died(struct uhci_hcd *uhci)
{
	uhci_get_current_frame_number(uhci);
	uhci->reset_hc(uhci);
	finish_reset(uhci);
	uhci->dead = 1;

	/* The current frame may already be partway finished */
	++uhci->frame_number;
}

/*
 * Initialize a controller that was newly discovered or has lost power
 * or otherwise been reset while it was suspended.  In none of these cases
 * can we be sure of its previous state.
 */
static void check_and_reset_hc(struct uhci_hcd *uhci)
{
	if (uhci->check_and_reset_hc(uhci))
		finish_reset(uhci);
}

#if defined(CONFIG_USB_UHCI_SUPPORT_NON_PCI_HC)
/*
 * The two functions below are generic reset functions that are used on systems
 * that do not have keyboard and mouse legacy support. We assume that we are
 * running on such a system if CONFIG_USB_UHCI_SUPPORT_NON_PCI_HC is defined.
 */

/*
 * Make sure the controller is completely inactive, unable to
 * generate interrupts or do DMA.
 */
static void uhci_generic_reset_hc(struct uhci_hcd *uhci)
{
	/* Reset the HC - this will force us to get a
	 * new notification of any already connected
	 * ports due to the virtual disconnect that it
	 * implies.
	 */
	uhci_writew(uhci, USBCMD_HCRESET, USBCMD);
	mb();
	udelay(5);
	if (uhci_readw(uhci, USBCMD) & USBCMD_HCRESET)
		dev_warn(uhci_dev(uhci), "HCRESET not completed yet!\n");

	/* Just to be safe, disable interrupt requests and
	 * make sure the controller is stopped.
	 */
	uhci_writew(uhci, 0, USBINTR);
	uhci_writew(uhci, 0, USBCMD);
}

/*
 * Initialize a controller that was newly discovered or has just been
 * resumed.  In either case we can't be sure of its previous state.
 *
 * Returns: 1 if the controller was reset, 0 otherwise.
 */
static int uhci_generic_check_and_reset_hc(struct uhci_hcd *uhci)
{
	unsigned int cmd, intr;

	/*
	 * When restarting a suspended controller, we expect all the
	 * settings to be the same as we left them:
	 *
	 *	Controller is stopped and configured with EGSM set;
	 *	No interrupts enabled except possibly Resume Detect.
	 *
	 * If any of these conditions are violated we do a complete reset.
	 */
	cmd = uhci_readw(uhci, USBCMD);
	if ((cmd & USBCMD_RS) || !(cmd & USBCMD_CF) || !(cmd & USBCMD_EGSM)) {
		dev_dbg(uhci_dev(uhci), "%s: cmd = 0x%04x\n",
				__func__, cmd);
		goto reset_needed;
	}

	intr = uhci_readw(uhci, USBINTR);
	if (intr & (~USBINTR_RESUME)) {
		dev_dbg(uhci_dev(uhci), "%s: intr = 0x%04x\n",
				__func__, intr);
		goto reset_needed;
	}
	return 0;

reset_needed:
	dev_dbg(uhci_dev(uhci), "Performing full reset\n");
	uhci_generic_reset_hc(uhci);
	return 1;
}
#endif /* CONFIG_USB_UHCI_SUPPORT_NON_PCI_HC */

/*
 * Store the basic register settings needed by the controller.
 */
static void configure_hc(struct uhci_hcd *uhci)
{
	/* Set the frame length to the default: 1 ms exactly */
	uhci_writeb(uhci, USBSOF_DEFAULT, USBSOF);

	/* Store the frame list base address */
	uhci_writel(uhci, uhci->frame_dma_handle, USBFLBASEADD);

	/* Set the current frame number */
	uhci_writew(uhci, uhci->frame_number & UHCI_MAX_SOF_NUMBER,
			USBFRNUM);

	/* perform any arch/bus specific configuration */
	if (uhci->configure_hc)
		uhci->configure_hc(uhci);
}

static int resume_detect_interrupts_are_broken(struct uhci_hcd *uhci)
{
	/*
	 * If we have to ignore overcurrent events then almost by definition
	 * we can't depend on resume-detect interrupts.
	 *
	 * Those interrupts also don't seem to work on ASpeed SoCs.
	 */
	if (ignore_oc || uhci_is_aspeed(uhci))
		return 1;

	return uhci->resume_detect_interrupts_are_broken ?
		uhci->resume_detect_interrupts_are_broken(uhci) : 0;
}

static int global_suspend_mode_is_broken(struct uhci_hcd *uhci)
{
	return uhci->global_suspend_mode_is_broken ?
		uhci->global_suspend_mode_is_broken(uhci) : 0;
}

static void suspend_rh(struct uhci_hcd *uhci, enum uhci_rh_state new_state)
__releases(uhci->lock)
__acquires(uhci->lock)
{
	int auto_stop;
	int int_enable, egsm_enable, wakeup_enable;
	struct usb_device *rhdev = uhci_to_hcd(uhci)->self.root_hub;

	auto_stop = (new_state == UHCI_RH_AUTO_STOPPED);
	dev_dbg(&rhdev->dev, "%s%s\n", __func__,
			(auto_stop ? " (auto-stop)" : ""));

	/* Start off by assuming Resume-Detect interrupts and EGSM work
	 * and that remote wakeups should be enabled.
	 */
	egsm_enable = USBCMD_EGSM;
	int_enable = USBINTR_RESUME;
	wakeup_enable = 1;

	/*
	 * In auto-stop mode, we must be able to detect new connections.
	 * The user can force us to poll by disabling remote wakeup;
	 * otherwise we will use the EGSM/RD mechanism.
	 */
	if (auto_stop) {
		if (!device_may_wakeup(&rhdev->dev))
			egsm_enable = int_enable = 0;
	}

#ifdef CONFIG_PM
	/*
	 * In bus-suspend mode, we use the wakeup setting specified
	 * for the root hub.
	 */
	else {
		if (!rhdev->do_remote_wakeup)
			wakeup_enable = 0;
	}
#endif

	/*
	 * UHCI doesn't distinguish between wakeup requests from downstream
	 * devices and local connect/disconnect events.  There's no way to
	 * enable one without the other; both are controlled by EGSM.  Thus
	 * if wakeups are disallowed then EGSM must be turned off -- in which
	 * case remote wakeup requests from downstream during system sleep
	 * will be lost.
	 *
	 * In addition, if EGSM is broken then we can't use it.  Likewise,
	 * if Resume-Detect interrupts are broken then we can't use them.
	 *
	 * Finally, neither EGSM nor RD is useful by itself.  Without EGSM,
	 * the RD status bit will never get set.  Without RD, the controller
	 * won't generate interrupts to tell the system about wakeup events.
	 */
	if (!wakeup_enable || global_suspend_mode_is_broken(uhci) ||
			resume_detect_interrupts_are_broken(uhci))
		egsm_enable = int_enable = 0;

	uhci->RD_enable = !!int_enable;
	uhci_writew(uhci, int_enable, USBINTR);
	uhci_writew(uhci, egsm_enable | USBCMD_CF, USBCMD);
	mb();
	udelay(5);

	/* If we're auto-stopping then no devices have been attached
	 * for a while, so there shouldn't be any active URBs and the
	 * controller should stop after a few microseconds.  Otherwise
	 * we will give the controller one frame to stop.
	 */
	if (!auto_stop && !(uhci_readw(uhci, USBSTS) & USBSTS_HCH)) {
		uhci->rh_state = UHCI_RH_SUSPENDING;
		spin_unlock_irq(&uhci->lock);
		msleep(1);
		spin_lock_irq(&uhci->lock);
		if (uhci->dead)
			return;
	}
	if (!(uhci_readw(uhci, USBSTS) & USBSTS_HCH))
		dev_warn(uhci_dev(uhci), "Controller not stopped yet!\n");

	uhci_get_current_frame_number(uhci);

	uhci->rh_state = new_state;
	uhci->is_stopped = UHCI_IS_STOPPED;

	/*
	 * If remote wakeup is enabled but either EGSM or RD interrupts
	 * doesn't work, then we won't get an interrupt when a wakeup event
	 * occurs.  Thus the suspended root hub needs to be polled.
	 */
	if (wakeup_enable && (!int_enable || !egsm_enable))
		set_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
	else
		clear_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);

	uhci_scan_schedule(uhci);
	uhci_fsbr_off(uhci);
}

static void start_rh(struct uhci_hcd *uhci)
{
	uhci->is_stopped = 0;

	/*
	 * Clear stale status bits on Aspeed as we get a stale HCH
	 * which causes problems later on
	 */
	if (uhci_is_aspeed(uhci))
		uhci_writew(uhci, uhci_readw(uhci, USBSTS), USBSTS);

	/* Mark it configured and running with a 64-byte max packet.
	 * All interrupts are enabled, even though RESUME won't do anything.
	 */
	uhci_writew(uhci, USBCMD_RS | USBCMD_CF | USBCMD_MAXP, USBCMD);
	uhci_writew(uhci, USBINTR_TIMEOUT | USBINTR_RESUME |
		USBINTR_IOC | USBINTR_SP, USBINTR);
	mb();
	uhci->rh_state = UHCI_RH_RUNNING;
	set_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
}

static void wakeup_rh(struct uhci_hcd *uhci)
__releases(uhci->lock)
__acquires(uhci->lock)
{
	dev_dbg(&uhci_to_hcd(uhci)->self.root_hub->dev,
			"%s%s\n", __func__,
			uhci->rh_state == UHCI_RH_AUTO_STOPPED ?
				" (auto-start)" : "");

	/* If we are auto-stopped then no devices are attached so there's
	 * no need for wakeup signals.  Otherwise we send Global Resume
	 * for 20 ms.
	 */
	if (uhci->rh_state == UHCI_RH_SUSPENDED) {
		unsigned egsm;

		/* Keep EGSM on if it was set before */
		egsm = uhci_readw(uhci, USBCMD) & USBCMD_EGSM;
		uhci->rh_state = UHCI_RH_RESUMING;
		uhci_writew(uhci, USBCMD_FGR | USBCMD_CF | egsm, USBCMD);
		spin_unlock_irq(&uhci->lock);
		msleep(20);
		spin_lock_irq(&uhci->lock);
		if (uhci->dead)
			return;

		/* End Global Resume and wait for EOP to be sent */
		uhci_writew(uhci, USBCMD_CF, USBCMD);
		mb();
		udelay(4);
		if (uhci_readw(uhci, USBCMD) & USBCMD_FGR)
			dev_warn(uhci_dev(uhci), "FGR not stopped yet!\n");
	}

	start_rh(uhci);

	/* Restart root hub polling */
	mod_timer(&uhci_to_hcd(uhci)->rh_timer, jiffies);
}

static irqreturn_t uhci_irq(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	unsigned short status;

	/*
	 * Read the interrupt status, and write it back to clear the
	 * interrupt cause.  Contrary to the UHCI specification, the
	 * "HC Halted" status bit is persistent: it is RO, not R/WC.
	 */
	status = uhci_readw(uhci, USBSTS);
	if (!(status & ~USBSTS_HCH))	/* shared interrupt, not mine */
		return IRQ_NONE;
	uhci_writew(uhci, status, USBSTS);	/* Clear it */

	spin_lock(&uhci->lock);
	if (unlikely(!uhci->is_initialized))	/* not yet configured */
		goto done;

	if (status & ~(USBSTS_USBINT | USBSTS_ERROR | USBSTS_RD)) {
		if (status & USBSTS_HSE)
			dev_err(uhci_dev(uhci),
				"host system error, PCI problems?\n");
		if (status & USBSTS_HCPE)
			dev_err(uhci_dev(uhci),
				"host controller process error, something bad happened!\n");
		if (status & USBSTS_HCH) {
			if (uhci->rh_state >= UHCI_RH_RUNNING) {
				dev_err(uhci_dev(uhci),
					"host controller halted, very bad!\n");
				if (debug > 1 && errbuf) {
					/* Print the schedule for debugging */
					uhci_sprint_schedule(uhci, errbuf,
							ERRBUF_LEN - EXTRA_SPACE);
					lprintk(errbuf);
				}
				uhci_hc_died(uhci);
				usb_hc_died(hcd);

				/* Force a callback in case there are
				 * pending unlinks
				 */
				mod_timer(&hcd->rh_timer, jiffies);
			}
		}
	}

	if (status & USBSTS_RD) {
		spin_unlock(&uhci->lock);
		usb_hcd_poll_rh_status(hcd);
	} else {
		uhci_scan_schedule(uhci);
 done:
		spin_unlock(&uhci->lock);
	}

	return IRQ_HANDLED;
}

/*
 * Store the current frame number in uhci->frame_number if the controller
 * is running.  Expand from 11 bits (of which we use only 10) to a
 * full-sized integer.
 *
 * Like many other parts of the driver, this code relies on being polled
 * more than once per second as long as the controller is running.
 */
static void uhci_get_current_frame_number(struct uhci_hcd *uhci)
{
	if (!uhci->is_stopped) {
		unsigned delta;

		delta = (uhci_readw(uhci, USBFRNUM) - uhci->frame_number) &
				(UHCI_NUMFRAMES - 1);
		uhci->frame_number += delta;
	}
}

/*
 * De-allocate all resources
 */
static void release_uhci(struct uhci_hcd *uhci)
{
	int i;

	spin_lock_irq(&uhci->lock);
	uhci->is_initialized = 0;
	spin_unlock_irq(&uhci->lock);

	debugfs_lookup_and_remove(uhci_to_hcd(uhci)->self.bus_name,
				  uhci_debugfs_root);

	for (i = 0; i < UHCI_NUM_SKELQH; i++)
		uhci_free_qh(uhci, uhci->skelqh[i]);

	uhci_free_td(uhci, uhci->term_td);

	dma_pool_destroy(uhci->qh_pool);

	dma_pool_destroy(uhci->td_pool);

	kfree(uhci->frame_cpu);

	dma_free_coherent(uhci_dev(uhci),
			UHCI_NUMFRAMES * sizeof(*uhci->frame),
			uhci->frame, uhci->frame_dma_handle);
}

/*
 * Allocate a frame list, and then setup the skeleton
 *
 * The hardware doesn't really know any difference
 * in the queues, but the order does matter for the
 * protocols higher up.  The order in which the queues
 * are encountered by the hardware is:
 *
 *  - All isochronous events are handled before any
 *    of the queues. We don't do that here, because
 *    we'll create the actual TD entries on demand.
 *  - The first queue is the high-period interrupt queue.
 *  - The second queue is the period-1 interrupt and async
 *    (low-speed control, full-speed control, then bulk) queue.
 *  - The third queue is the terminating bandwidth reclamation queue,
 *    which contains no members, loops back to itself, and is present
 *    only when FSBR is on and there are no full-speed control or bulk QHs.
 */
static int uhci_start(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	int retval = -EBUSY;
	int i;

	hcd->uses_new_polling = 1;
	/* Accept arbitrarily long scatter-gather lists */
	if (!hcd->localmem_pool)
		hcd->self.sg_tablesize = ~0;

	spin_lock_init(&uhci->lock);
	timer_setup(&uhci->fsbr_timer, uhci_fsbr_timeout, 0);
	INIT_LIST_HEAD(&uhci->idle_qh_list);
	init_waitqueue_head(&uhci->waitqh);

#ifdef UHCI_DEBUG_OPS
	debugfs_create_file(hcd->self.bus_name, S_IFREG|S_IRUGO|S_IWUSR,
			    uhci_debugfs_root, uhci, &uhci_debug_operations);
#endif

	uhci->frame = dma_alloc_coherent(uhci_dev(uhci),
			UHCI_NUMFRAMES * sizeof(*uhci->frame),
			&uhci->frame_dma_handle, GFP_KERNEL);
	if (!uhci->frame) {
		dev_err(uhci_dev(uhci),
			"unable to allocate consistent memory for frame list\n");
		goto err_alloc_frame;
	}

	uhci->frame_cpu = kcalloc(UHCI_NUMFRAMES, sizeof(*uhci->frame_cpu),
			GFP_KERNEL);
	if (!uhci->frame_cpu)
		goto err_alloc_frame_cpu;

	uhci->td_pool = dma_pool_create("uhci_td", uhci_dev(uhci),
			sizeof(struct uhci_td), 16, 0);
	if (!uhci->td_pool) {
		dev_err(uhci_dev(uhci), "unable to create td dma_pool\n");
		goto err_create_td_pool;
	}

	uhci->qh_pool = dma_pool_create("uhci_qh", uhci_dev(uhci),
			sizeof(struct uhci_qh), 16, 0);
	if (!uhci->qh_pool) {
		dev_err(uhci_dev(uhci), "unable to create qh dma_pool\n");
		goto err_create_qh_pool;
	}

	uhci->term_td = uhci_alloc_td(uhci);
	if (!uhci->term_td) {
		dev_err(uhci_dev(uhci), "unable to allocate terminating TD\n");
		goto err_alloc_term_td;
	}

	for (i = 0; i < UHCI_NUM_SKELQH; i++) {
		uhci->skelqh[i] = uhci_alloc_qh(uhci, NULL, NULL);
		if (!uhci->skelqh[i]) {
			dev_err(uhci_dev(uhci), "unable to allocate QH\n");
			goto err_alloc_skelqh;
		}
	}

	/*
	 * 8 Interrupt queues; link all higher int queues to int1 = async
	 */
	for (i = SKEL_ISO + 1; i < SKEL_ASYNC; ++i)
		uhci->skelqh[i]->link = LINK_TO_QH(uhci, uhci->skel_async_qh);
	uhci->skel_async_qh->link = UHCI_PTR_TERM(uhci);
	uhci->skel_term_qh->link = LINK_TO_QH(uhci, uhci->skel_term_qh);

	/* This dummy TD is to work around a bug in Intel PIIX controllers */
	uhci_fill_td(uhci, uhci->term_td, 0, uhci_explen(0) |
			(0x7f << TD_TOKEN_DEVADDR_SHIFT) | USB_PID_IN, 0);
	uhci->term_td->link = UHCI_PTR_TERM(uhci);
	uhci->skel_async_qh->element = uhci->skel_term_qh->element =
			LINK_TO_TD(uhci, uhci->term_td);

	/*
	 * Fill the frame list: make all entries point to the proper
	 * interrupt queue.
	 */
	for (i = 0; i < UHCI_NUMFRAMES; i++) {

		/* Only place we don't use the frame list routines */
		uhci->frame[i] = uhci_frame_skel_link(uhci, i);
	}

	/*
	 * Some architectures require a full mb() to enforce completion of
	 * the memory writes above before the I/O transfers in configure_hc().
	 */
	mb();

	spin_lock_irq(&uhci->lock);
	configure_hc(uhci);
	uhci->is_initialized = 1;
	start_rh(uhci);
	spin_unlock_irq(&uhci->lock);
	return 0;

/*
 * error exits:
 */
err_alloc_skelqh:
	for (i = 0; i < UHCI_NUM_SKELQH; i++) {
		if (uhci->skelqh[i])
			uhci_free_qh(uhci, uhci->skelqh[i]);
	}

	uhci_free_td(uhci, uhci->term_td);

err_alloc_term_td:
	dma_pool_destroy(uhci->qh_pool);

err_create_qh_pool:
	dma_pool_destroy(uhci->td_pool);

err_create_td_pool:
	kfree(uhci->frame_cpu);

err_alloc_frame_cpu:
	dma_free_coherent(uhci_dev(uhci),
			UHCI_NUMFRAMES * sizeof(*uhci->frame),
			uhci->frame, uhci->frame_dma_handle);

err_alloc_frame:
	debugfs_lookup_and_remove(hcd->self.bus_name, uhci_debugfs_root);

	return retval;
}

static void uhci_stop(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);

	spin_lock_irq(&uhci->lock);
	if (HCD_HW_ACCESSIBLE(hcd) && !uhci->dead)
		uhci_hc_died(uhci);
	uhci_scan_schedule(uhci);
	spin_unlock_irq(&uhci->lock);
	synchronize_irq(hcd->irq);

	timer_delete_sync(&uhci->fsbr_timer);
	release_uhci(uhci);
}

#ifdef CONFIG_PM
static int uhci_rh_suspend(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	int rc = 0;

	spin_lock_irq(&uhci->lock);
	if (!HCD_HW_ACCESSIBLE(hcd))
		rc = -ESHUTDOWN;
	else if (uhci->dead)
		;		/* Dead controllers tell no tales */

	/* Once the controller is stopped, port resumes that are already
	 * in progress won't complete.  Hence if remote wakeup is enabled
	 * for the root hub and any ports are in the middle of a resume or
	 * remote wakeup, we must fail the suspend.
	 */
	else if (hcd->self.root_hub->do_remote_wakeup &&
			uhci->resuming_ports) {
		dev_dbg(uhci_dev(uhci),
			"suspend failed because a port is resuming\n");
		rc = -EBUSY;
	} else
		suspend_rh(uhci, UHCI_RH_SUSPENDED);
	spin_unlock_irq(&uhci->lock);
	return rc;
}

static int uhci_rh_resume(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	int rc = 0;

	spin_lock_irq(&uhci->lock);
	if (!HCD_HW_ACCESSIBLE(hcd))
		rc = -ESHUTDOWN;
	else if (!uhci->dead)
		wakeup_rh(uhci);
	spin_unlock_irq(&uhci->lock);
	return rc;
}
#endif

/* Wait until a particular device/endpoint's QH is idle, and free it */
static void uhci_hcd_endpoint_disable(struct usb_hcd *hcd,
		struct usb_host_endpoint *hep)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	struct uhci_qh *qh;

	spin_lock_irq(&uhci->lock);
	qh = (struct uhci_qh *) hep->hcpriv;
	if (qh == NULL)
		goto done;

	while (qh->state != QH_STATE_IDLE) {
		++uhci->num_waiting;
		spin_unlock_irq(&uhci->lock);
		wait_event_interruptible(uhci->waitqh,
				qh->state == QH_STATE_IDLE);
		spin_lock_irq(&uhci->lock);
		--uhci->num_waiting;
	}

	uhci_free_qh(uhci, qh);
done:
	spin_unlock_irq(&uhci->lock);
}

static int uhci_hcd_get_frame_number(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	unsigned frame_number;
	unsigned delta;

	/* Minimize latency by avoiding the spinlock */
	frame_number = uhci->frame_number;
	barrier();
	delta = (uhci_readw(uhci, USBFRNUM) - frame_number) &
			(UHCI_NUMFRAMES - 1);
	return frame_number + delta;
}

/* Determines number of ports on controller */
static int uhci_count_ports(struct usb_hcd *hcd)
{
	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
	unsigned io_size = (unsigned) hcd->rsrc_len;
	int port;

	/* The UHCI spec says devices must have 2 ports, and goes on to say
	 * they may have more but gives no way to determine how many there
	 * are.  However according to the UHCI spec, Bit 7 of the port
	 * status and control register is always set to 1.  So we try to
	 * use this to our advantage.  Another common failure mode when
	 * a nonexistent register is addressed is to return all ones, so
	 * we test for that also.
	 */
	for (port = 0; port < (io_size - USBPORTSC1) / 2; port++) {
		unsigned int portstatus;

		portstatus = uhci_readw(uhci, USBPORTSC1 + (port * 2));
		if (!(portstatus & 0x0080) || portstatus == 0xffff)
			break;
	}
	if (debug)
		dev_info(uhci_dev(uhci), "detected %d ports\n", port);

	/* Anything greater than 7 is weird so we'll ignore it. */
	if (port > UHCI_RH_MAXCHILD) {
		dev_info(uhci_dev(uhci),
			"port count misdetected? forcing to 2 ports\n");
		port = 2;
	}

	return port;
}

static const char hcd_name[] = "uhci_hcd";

#if defined(CONFIG_USB_PCI) && defined(CONFIG_HAS_IOPORT)
#include "uhci-pci.c"
#define PCI_DRIVER		uhci_pci_driver
#endif

#ifdef CONFIG_SPARC_LEON
#include "uhci-grlib.c"
#define PLATFORM_DRIVER		uhci_grlib_driver
#endif

#ifdef CONFIG_USB_UHCI_PLATFORM
#include "uhci-platform.c"
#define PLATFORM_DRIVER		uhci_platform_driver
#endif

#if !defined(PCI_DRIVER) && !defined(PLATFORM_DRIVER)
#error "missing bus glue for uhci-hcd"
#endif

static int __init uhci_hcd_init(void)
{
	int retval = -ENOMEM;

	if (usb_disabled())
		return -ENODEV;

	set_bit(USB_UHCI_LOADED, &usb_hcds_loaded);

#ifdef CONFIG_DYNAMIC_DEBUG
	errbuf = kmalloc(ERRBUF_LEN, GFP_KERNEL);
	if (!errbuf)
		goto errbuf_failed;
	uhci_debugfs_root = debugfs_create_dir("uhci", usb_debug_root);
#endif

	uhci_up_cachep = kmem_cache_create("uhci_urb_priv",
		sizeof(struct urb_priv), 0, 0, NULL);
	if (!uhci_up_cachep)
		goto up_failed;

#ifdef PLATFORM_DRIVER
	retval = platform_driver_register(&PLATFORM_DRIVER);
	if (retval < 0)
		goto clean0;
#endif

#ifdef PCI_DRIVER
	retval = pci_register_driver(&PCI_DRIVER);
	if (retval < 0)
		goto clean1;
#endif

	return 0;

#ifdef PCI_DRIVER
clean1:
#endif
#ifdef PLATFORM_DRIVER
	platform_driver_unregister(&PLATFORM_DRIVER);
clean0:
#endif
	kmem_cache_destroy(uhci_up_cachep);

up_failed:
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
	debugfs_remove(uhci_debugfs_root);
	kfree(errbuf);

errbuf_failed:
#endif

	clear_bit(USB_UHCI_LOADED, &usb_hcds_loaded);
	return retval;
}

static void __exit uhci_hcd_cleanup(void)
{
#ifdef PLATFORM_DRIVER
	platform_driver_unregister(&PLATFORM_DRIVER);
#endif
#ifdef PCI_DRIVER
	pci_unregister_driver(&PCI_DRIVER);
#endif
	kmem_cache_destroy(uhci_up_cachep);
	debugfs_remove(uhci_debugfs_root);
#ifdef CONFIG_DYNAMIC_DEBUG
	kfree(errbuf);
#endif
	clear_bit(USB_UHCI_LOADED, &usb_hcds_loaded);
}

module_init(uhci_hcd_init);
module_exit(uhci_hcd_cleanup);

MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
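The frame-to-skeleton mapping in uhci_frame_skel_link() above can be exercised on its own: the position of the lowest set bit of the frame number selects the interrupt period, so period-2 work lands in odd frames, period-4 work in frames congruent to 2 (mod 4), and so on, leaving at most two interrupt QHs per frame. A userspace sketch, using __builtin_ctz() as a stand-in for the kernel's __ffs() and the driver's UHCI_NUMFRAMES value of 1024; skel_for_frame() is an illustrative name:

/* Standalone sketch of uhci_frame_skel_link()'s skeleton-QH selection. */
#include <stdio.h>

#define UHCI_NUMFRAMES 1024

static int skel_for_frame(unsigned frame)
{
	int skelnum = 8 - __builtin_ctz(frame | UHCI_NUMFRAMES);

	if (skelnum <= 1)	/* ffs >= 7: fall back to the period-1 QH */
		skelnum = 9;
	return skelnum;
}

int main(void)
{
	for (unsigned frame = 0; frame < 8; frame++)
		printf("frame %u -> skelqh[%d]\n", frame,
		       skel_for_frame(frame));
	/* frames 1,3,5,7 -> skelqh[8] (period 2); 2,6 -> skelqh[7]
	 * (period 4); 4 -> skelqh[6] (period 8); 0 -> skelqh[9] (period 1). */
	return 0;
}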
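uhci_get_current_frame_number() and uhci_hcd_get_frame_number() both expand the controller's narrow frame counter the same way: subtract the software copy from the hardware register, mask to the counter width, and add the masked delta back. This is only correct while the counter is sampled at least once per wrap (about one second at 1024 frames per second), which is why the surrounding comments insist on frequent polling. A minimal sketch with a hypothetical expand() helper:

/* Sketch of the masked-delta frame-counter expansion; not kernel code. */
#include <stdio.h>

#define UHCI_NUMFRAMES 1024

static unsigned long expand(unsigned long soft, unsigned hw_frnum)
{
	/* masked difference between the 10-bit register and our copy */
	unsigned delta = (hw_frnum - soft) & (UHCI_NUMFRAMES - 1);

	return soft + delta;
}

int main(void)
{
	/* Software counter at 2040, hardware register wrapped to 30:
	 * (30 - 2040) & 1023 = 38, so the full counter becomes 2078. */
	printf("%lu\n", expand(2040, 30));
	return 0;
}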
5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 
6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 
// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> #include <asm/debugreg.h> #include <asm/mmu_context.h> #include <asm/msr.h> #include "x86.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" #include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context.
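 *
 * For illustration only (editor's sketch, not part of the original file):
 * the INVVPID emulation is expected to validate a guest-supplied type
 * against this mask, which sits at bits 11:8 of the reported
 * MSR_IA32_VMX_EPT_VPID_CAP value, roughly:
 *
 *	types = (vmx->nested.msrs.vpid_caps &
 *		 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 *	if (type >= 32 || !(types & (1 << type)))
 *		return nested_vmx_fail(vcpu,
 *			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);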
*/ #define VMX_VPID_EXTENT_SUPPORTED_MASK \ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 enum { VMX_VMREAD_BITMAP, VMX_VMWRITE_BITMAP, VMX_BITMAP_NR }; static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) struct shadow_vmcs_field { u16 encoding; u16 offset; }; static struct shadow_vmcs_field shadow_read_only_fields[] = { #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static struct shadow_vmcs_field shadow_read_write_fields[] = { #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static void init_vmcs_shadow_fields(void) { int i, j; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); for (i = j = 0; i < max_shadow_read_only_fields; i++) { struct shadow_vmcs_field entry = shadow_read_only_fields[i]; u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || shadow_read_only_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", field + 1); clear_bit(field, vmx_vmread_bitmap); if (field & 1) #ifdef CONFIG_X86_64 continue; #else entry.offset += sizeof(u32); #endif shadow_read_only_fields[j++] = entry; } max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { struct shadow_vmcs_field entry = shadow_read_write_fields[i]; u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || shadow_read_write_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", field + 1); WARN_ONCE(field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES, "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist * on bare metal. */ switch (field) { case GUEST_PML_INDEX: if (!cpu_has_vmx_pml()) continue; break; case VMX_PREEMPTION_TIMER_VALUE: if (!cpu_has_vmx_preemption_timer()) continue; break; case GUEST_INTR_STATUS: if (!cpu_has_vmx_apicv()) continue; break; default: break; } clear_bit(field, vmx_vmwrite_bitmap); clear_bit(field, vmx_vmread_bitmap); if (field & 1) #ifdef CONFIG_X86_64 continue; #else entry.offset += sizeof(u32); #endif shadow_read_write_fields[j++] = entry; } max_shadow_read_write_fields = j; } /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction (as specified * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated * instruction. 
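 *
 * As a reference for the three helpers (editor's summary of the SDM
 * conventions, not part of the original file):
 *
 *	VMsucceed:     all six arithmetic flags cleared
 *	VMfailInvalid: CF=1, the rest cleared (no current VMCS to hold an
 *	               error code)
 *	VMfailValid:   ZF=1, the rest cleared, and the error number stored
 *	               in the current VMCS's VM_INSTRUCTION_ERROR field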
*/ static int nested_vmx_succeed(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_CF); return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_failValid(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_ZF); get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; /* * We don't need to force sync to shadow VMCS because * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced. */ if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ if (vmx->nested.current_vmptr == INVALID_GPA && !nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); } static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: do not simply reset the guest here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) { return low | ((u64)high << 32); } static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { hv_vcpu->nested.pa_page_gpa = INVALID_GPA; hv_vcpu->nested.vm_id = 0; hv_vcpu->nested.vp_id = 0; } #endif } static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) { #ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); /* * When Enlightened VMEntry is enabled on the calling CPU we treat the * memory area pointed to by vmptr as an Enlightened VMCS (as there's no * good way to distinguish it from VMCS12) and we must not corrupt it by * writing to the non-existent 'launch_state' field. The area doesn't * have to be the currently active EVMCS on the calling CPU and there's * nothing KVM has to do to transition it from 'active' to 'non-active' * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem.
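 *
 * A hedged sketch of how a VMCLEAR handler is expected to use this
 * helper (editor's illustration, not the verbatim handler):
 *
 *	if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
 *		// Not an eVMCS: safe to mark the VMCS12 as not launched.
 *		u32 zero = 0;
 *		kvm_write_guest(vcpu->kvm,
 *				vmptr + offsetof(struct vmcs12, launch_state),
 *				&zero, sizeof(zero));
 *	}
 *	return nested_vmx_succeed(vcpu);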
*/ if (!guest_cpu_cap_has_evmcs(vcpu) || !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); return true; #else return false; #endif } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, struct loaded_vmcs *prev) { struct vmcs_host_state *dest, *src; if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; dest->es_sel = src->es_sel; #endif } static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct loaded_vmcs *prev; int cpu; if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) return; cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; /* * All lazily updated registers will be reloaded from VMCS12 on both * vmentry and vmexit. */ vcpu->arch.regs_dirty = 0; } static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); vmx->nested.pi_desc = NULL; } /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. */ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); nested_release_evmcs(vcpu); free_loaded_vmcs(&vmx->nested.vmcs02); } /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). 
*/ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); vcpu_put(vcpu); } #define EPTP_PA_MASK GENMASK_ULL(51, 12) static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) { return VALID_PAGE(root_hpa) && ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); } static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, gpa_t addr) { unsigned long roots = 0; uint i; struct kvm_mmu_root_info *cached_root; WARN_ON_ONCE(!mmu_is_nested(vcpu)); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { cached_root = &vcpu->arch.mmu->prev_roots[i]; if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, eptp)) roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u32 vm_exit_reason; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; /* * It should be impossible to trigger a nested PML Full VM-Exit * for anything other than an EPT Violation from L2. KVM *can* * trigger nEPT page fault injection in response to an EPT * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report * "NMI unblocking due to IRET", i.e. the bit can be propagated * as-is from the original EXIT_QUALIFICATION. */ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; exit_qualification = 0; } else { exit_qualification = fault->exit_qualification; exit_qualification |= vmx_get_exit_qual(vcpu) & (EPT_VIOLATION_GVA_IS_VALID | EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; } /* * Although the caller (kvm_inject_emulated_page_fault) would * have already synced the faulting address in the shadow EPT * tables for the current EPTP12, we also need to sync it for * any other cached EPTP02s based on the same EP4TA, since the * TLB associates mappings to the EP4TA rather than the full EPTP. 
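 *
 * Worked illustration (editor's note): the EP4TA is just the address
 * bits of the EPTP, so two EPTP12 values that differ only in memtype,
 * page-walk length or AD-enable bits still hit the same cached root:
 *
 *	// EPTP_PA_MASK == GENMASK_ULL(51, 12), defined above
 *	same_ep4ta = (eptp1 & EPTP_PA_MASK) == (eptp2 & EPTP_PA_MASK);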
*/ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, fault->address); } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), nested_ept_get_eptp(vcpu)); } static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; nested_ept_new_eptp(vcpu); vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) { vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code) { bool inequality, bit; bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; inequality = (error_code & vmcs12->page_fault_error_code_mask) != vmcs12->page_fault_error_code_match; return inequality ^ bit; } static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, u32 error_code) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* * Drop bits 31:16 of the error code when performing the #PF mask+match * check. All VMCS fields involved are 32 bits, but Intel CPUs never * set bits 31:16 and VMX disallows setting bits 31:16 in the injected * error code. Including the to-be-dropped bits in the check might * result in an "impossible" or missed exit from L1's perspective. */ if (vector == PF_VECTOR) return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); return (vmcs12->exception_bitmap & (1u << vector)); } static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) return -EINVAL; return 0; } static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) return -EINVAL; return 0; } static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; return 0; } /* * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, * only the "disable intercept" case needs to be handled. 
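 *
 * For reference (editor's sketch of the MSR-bitmap geometry assumed by
 * the helpers that follow): MSRs 0x800..0x8ff occupy 256 bits of the
 * low-MSR read bitmap, and each MSR's write bit lives 0x800 bytes
 * later, hence the word-at-a-time arithmetic:
 *
 *	word = msr / BITS_PER_LONG;               // read-intercept word
 *	write_word = word + 0x800 / sizeof(long); // matching write word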
*/ static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, unsigned long *msr_bitmap_l0, u32 msr, int type) { if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { int msr; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG; msr_bitmap[word] = ~0; msr_bitmap[word + (0x800 / sizeof(long))] = ~0; } } #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ static inline \ void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ unsigned long *msr_bitmap_l1, \ unsigned long *msr_bitmap_l0, u32 msr) \ { \ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ else \ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ } BUILD_NVMX_MSR_INTERCEPT_HELPER(read) BUILD_NVMX_MSR_INTERCEPT_HELPER(write) static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, unsigned long *msr_bitmap_l1, unsigned long *msr_bitmap_l0, u32 msr, int types) { if (types & MSR_TYPE_R) nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, msr_bitmap_l0, msr); if (types & MSR_TYPE_W) nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, msr_bitmap_l0, msr); } /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. */ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; /* * MSR bitmap update can be skipped when: * - MSR bitmap for L1 hasn't changed. * - Nested hypervisor (L1) is attempting to launch the same L2 as * before. * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2. */ if (!vmx->nested.force_msr_bitmap_recalc) { struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) return true; } if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) { /* * L0 need not intercept reads for MSRs between 0x800 * and 0x8ff, it just lets the processor take the value * from the virtual-APIC page; take those 256 bits * directly from the L1 bitmap. 
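 *
 * Worked numbers for the copy below (editor's note): 0x8ff - 0x800 + 1
 * = 256 MSRs, i.e. 256 read-intercept bits, so the loop moves
 * 256 / BITS_PER_LONG machine words (four 8-byte words on 64-bit),
 * inheriting L1's read-intercept choices for the whole x2APIC range.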
*/ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG; msr_bitmap_l0[word] = msr_bitmap_l1[word]; } } nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } /* * Always check vmcs01's bitmap to honor userspace MSR filters and any * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. */ #ifdef CONFIG_X86_64 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_FS_BASE, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_GS_BASE, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_APERF, MSR_TYPE_R); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_MPERF, MSR_TYPE_R); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_U_CET, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_S_CET, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL0_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL1_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL2_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL3_SSP, MSR_TYPE_RW); kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; return true; } static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; if (ghc->gpa != vmcs12->vmcs_link_pointer && kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; if (ghc->gpa != vmcs12->vmcs_link_pointer && kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } /* * In nested virtualization, check if L1 has set * VM_EXIT_ACK_INTR_ON_EXIT */ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_ACK_INTR_ON_EXIT; } static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_cpu_has2(vmcs12, 
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) return -EINVAL; else return 0; } static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !nested_cpu_has_apic_reg_virt(vmcs12) && !nested_cpu_has_vid(vmcs12) && !nested_cpu_has_posted_intr(vmcs12)) return 0; /* * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled. */ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) return -EINVAL; /* * If virtual interrupt delivery is enabled, * we must exit on external interrupts. */ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) return -EINVAL; /* * Bits 15:8 should be zero in posted_intr_nv; the descriptor address * has already been checked in nested_get_vmcs12_pages. * * Bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* TPR shadow is needed by all APICv features. */ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) return -EINVAL; return 0; } static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, vmx->nested.msrs.misc_high); return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; /* * Exceeding the limit results in architecturally _undefined_ behavior, * i.e. KVM is allowed to do literally anything in response to a bad * limit. Immediately generate a consistency check so that code that * consumes the count doesn't need to worry about extreme edge cases.
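 *
 * Worked example (editor's note, assuming the SDM encoding where
 * VMX_MISC bits 27:25 hold N and each list may hold 512 * (N + 1)
 * entries): nested_vmx_max_atomic_switch_msrs() above computes
 *
 *	(vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER
 *	// N == 0  ->  (0 + 1) * 512 = 512 MSRs per list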
*/ if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) return -EINVAL; if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; } static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, vmcs12->vm_exit_msr_load_addr)) || CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, vmcs12->vm_exit_msr_store_addr))) return -EINVAL; return 0; } static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, vmcs12->vm_entry_msr_load_addr))) return -EINVAL; return 0; } static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_pml(vmcs12)) return 0; if (CC(!nested_cpu_has_ept(vmcs12)) || CC(!page_address_valid(vcpu, vmcs12->pml_address))) return -EINVAL; return 0; } static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_shadow_vmcs(vmcs12)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) return -EINVAL; return 0; } static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) return -EINVAL; if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ CC(e->index == MSR_IA32_UCODE_REV)) return -EINVAL; if (CC(e->reserved != 0)) return -EINVAL; return 0; } static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { if (CC(e->index == MSR_FS_BASE) || CC(e->index == MSR_GS_BASE) || CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } /* * Load guest's/host's MSRs at nested entry/exit. Return 0 for success, or the * (1-based) index of the failing entry otherwise. * * One of the failure modes for MSR load/store is when a list exceeds the * virtual hardware's capacity. To maintain compatibility with hardware insofar * as possible, process all valid entries before failing rather than precheck * for a capacity violation.
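 *
 * The convention this implies for callers (editor's sketch, matching
 * the fail path in nested_vmx_load_msr() below):
 *
 *	failed = nested_vmx_load_msr(vcpu, gpa, count);
 *	if (failed)
 *		// entry (failed - 1) was rejected; the emulated VM-Entry
 *		// fails with this 1-based index as the exit qualification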
*/ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; } if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; } } return 0; fail: /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1; } static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * If the L0 hypervisor stored a more accurate value for the TSC that * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, MSR_IA32_TSC); if (i >= 0) { u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; } } if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; } return true; } static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, struct vmx_msr_entry *e) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(*e), e, 2 * sizeof(u32))) { pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(*e)); return false; } if (nested_vmx_store_msr_check(vcpu, e)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e->index, e->reserved); return false; } return true; } static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u64 data; u32 i; struct vmx_msr_entry e; u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return -EINVAL; if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) return -EINVAL; if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &data, sizeof(data))) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, data); return -EINVAL; } } return 0; } static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 count = vmcs12->vm_exit_msr_store_count; u64 gpa = vmcs12->vm_exit_msr_store_addr; struct vmx_msr_entry e; u32 i; for (i = 0; i < count; i++) { if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return false; if (e.index == msr_index) return true; } return false; } static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, u32 msr_index) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; int msr_autostore_slot; bool in_autostore_list; int last; msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && 
!in_autostore_list) { if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by * nested_vmx_get_vmexit_msr_value() by reading KVM's * internal MSR state instead of reading the value from * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n", msr_index); return; } last = autostore->nr++; autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; autostore->val[msr_autostore_slot] = autostore->val[last]; } } /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } /* * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && CC(!load_pdptrs(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } vcpu->arch.cr3 = cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); if (!nested_ept) kvm_mmu_new_pgd(vcpu, cr3); return 0; } /* * Returns true if KVM is able to configure the CPU to tag TLB entries * populated by L2 differently from TLB entries populated * by L1. * * If L0 uses EPT, L1 and L2 run with different EPTP because * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid * while L2 entries are tagged with vmx->nested.vpid02). */ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); return enable_ept || (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool is_vmenter) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* Handle pending Hyper-V TLB flush requests */ kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host, and so architecturally, linear and combined * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This * is required if VPID is disabled in KVM, as a TLB flush (there are no * VPIDs) still occurs from L1's perspective, and KVM may need to * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't * flushed on VPID invalidations, including VM-Enter or VM-Exit with * VPID disabled. As a result, KVM _never_ needs to sync nEPT * entries on VM-Enter because L1 can't rely on VM-Enter to flush * those mappings.
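 *
 * Decision summary (editor's recap of the three cases handled below):
 *
 *	vmcs12 has no VPID         -> KVM_REQ_TLB_FLUSH_GUEST (vpid01)
 *	vpid12 changed at VM-Enter -> KVM_REQ_TLB_FLUSH_GUEST (new ASID)
 *	L2 lacks a unique TLB tag  -> KVM_REQ_TLB_FLUSH_CURRENT (shared ASID)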
*/ if (!nested_cpu_has_vpid(vmcs12)) { kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; } /* L2 should never have a VPID if VPID is disabled. */ WARN_ON(!enable_vpid); /* * VPID is enabled and in use by vmcs12. If vpid12 is changing, then * emulate a guest TLB flush as KVM does not track vpid12 history nor * is the VPID incorporated into the MMU context. I.e. KVM must assume * that the new vpid12 has never been used and thus represents a new * guest ASID that cannot have entries in the TLB. */ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; } /* * If VPID is enabled, used by vmcs12, and vpid12 is not changing but * does not have a unique TLB tag (ASID), i.e. EPT is disabled and * KVM was unable to allocate a VPID for L2, flush the current context * as the effective ASID is common to both L1 and L2. */ if (!nested_has_guest_tlb_tag(vcpu)) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) { superset &= mask; subset &= mask; return (superset | subset) == superset; } static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | VMX_BASIC_INOUT | VMX_BASIC_TRUE_CTLS | VMX_BASIC_NO_HW_ERROR_CODE_CC; const u64 reserved_bits = GENMASK_ULL(63, 57) | GENMASK_ULL(47, 45) | BIT_ULL(31); u64 vmx_basic = vmcs_config.nested.basic; BUILD_BUG_ON(feature_bits & reserved_bits); /* * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has * inverted polarity), the incoming value must not set feature bits or * reserved bits that aren't allowed/supported by KVM. Fields, i.e. * multi-bit values, are explicitly checked below. */ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != vmx_basic_vmcs_revision_id(data)) return -EINVAL; if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) return -EINVAL; vmx->nested.msrs.basic = data; return 0; } static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u32 **low, u32 **high) { switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: *low = &msrs->pinbased_ctls_low; *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: *low = &msrs->procbased_ctls_low; *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: *low = &msrs->exit_ctls_low; *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: *low = &msrs->entry_ctls_low; *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *low = &msrs->secondary_ctls_low; *high = &msrs->secondary_ctls_high; break; default: BUG(); } } static int vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) { u32 *lowp, *highp; u64 supported; vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); /* Check must-be-1 bits are still 1. */ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) return -EINVAL; /* Check must-be-0 bits are still 0.
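 *
 * Worked example (editor's note): with supported = 0x16 in the low
 * word (bits that must remain 1) and incoming data = 0x12, the subset
 * test rejects the restore because bit 2 was cleared:
 *
 *	is_bitwise_subset(0x12, 0x16, GENMASK_ULL(31, 0))
 *	// (0x12 | 0x16) == 0x16, which != superset 0x12  ->  false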
*/ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; } static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_SHUTDOWN | VMX_MISC_ACTIVITY_WAIT_SIPI | VMX_MISC_INTEL_PT | VMX_MISC_RDMSR_IN_SMM | VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_VMXOFF_BLOCK_SMI | VMX_MISC_ZERO_LEN_INS; const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); BUILD_BUG_ON(feature_bits & reserved_bits); /* * The incoming value must not set feature bits or reserved bits that * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are * explicitly checked below. */ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) && vmx_misc_preemption_timer_rate(data) != vmx_misc_preemption_timer_rate(vmx_misc)) return -EINVAL; if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) return -EINVAL; if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) return -EINVAL; if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) return -EINVAL; vmx->nested.msrs.misc_low = data; vmx->nested.msrs.misc_high = data >> 32; return 0; } static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) return -EINVAL; vmx->nested.msrs.ept_caps = data; vmx->nested.msrs.vpid_caps = data >> 32; return 0; } static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: return &msrs->cr4_fixed0; default: BUG(); } } static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) { const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) * must be 1 in the restored value. */ if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } /* * Called when userspace is restoring VMX MSRs. * * Returns 0 on success, non-0 otherwise. */ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Don't allow changes to the VMX capability MSRs while the vCPU * is in VMX operation. */ if (vmx->nested.vmxon) return -EBUSY; switch (msr_index) { case MSR_IA32_VMX_BASIC: return vmx_restore_vmx_basic(vmx, data); case MSR_IA32_VMX_PINBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: /* * The "non-true" VMX capability MSRs are generated from the * "true" MSRs, so we do not support restoring them directly. * * If userspace wants to emulate VMX_BASIC[55]=0, userspace * should restore the "true" MSRs with the must-be-1 bits * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND * DEFAULT SETTINGS". 
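 *
 * For illustration, the read side derives a "non-true" value from the
 * "true" one by OR-ing in the always-on defaults (editor's sketch,
 * mirroring vmx_get_vmx_msr() below):
 *
 *	*pdata = vmx_control_msr(msrs->pinbased_ctls_low,
 *				 msrs->pinbased_ctls_high);
 *	if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
 *		*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;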
*/ return -EINVAL; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS2: return vmx_restore_control_msr(vmx, msr_index, data); case MSR_IA32_VMX_MISC: return vmx_restore_vmx_misc(vmx, data); case MSR_IA32_VMX_CR0_FIXED0: case MSR_IA32_VMX_CR4_FIXED0: return vmx_restore_fixed0_msr(vmx, msr_index, data); case MSR_IA32_VMX_CR0_FIXED1: case MSR_IA32_VMX_CR4_FIXED1: /* * These MSRs are generated based on the vCPU's CPUID, so we * do not support restoring them directly. */ return -EINVAL; case MSR_IA32_VMX_EPT_VPID_CAP: return vmx_restore_vmx_ept_vpid_cap(vmx, data); case MSR_IA32_VMX_VMCS_ENUM: vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; default: /* * The rest of the VMX capability MSRs do not support restore. */ return -EINVAL; } } /* Returns 0 on success, non-0 otherwise. */ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) { switch (msr_index) { case MSR_IA32_VMX_BASIC: *pdata = msrs->basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( msrs->pinbased_ctls_low, msrs->pinbased_ctls_high); if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( msrs->procbased_ctls_low, msrs->procbased_ctls_high); if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( msrs->exit_ctls_low, msrs->exit_ctls_high); if (msr_index == MSR_IA32_VMX_EXIT_CTLS) *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( msrs->entry_ctls_low, msrs->entry_ctls_high); if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( msrs->misc_low, msrs->misc_high); break; case MSR_IA32_VMX_CR0_FIXED0: *pdata = msrs->cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: *pdata = msrs->cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: *pdata = msrs->cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: *pdata = msrs->cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: *pdata = msrs->vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( msrs->secondary_ctls_low, msrs->secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: *pdata = msrs->ept_caps | ((u64)msrs->vpid_caps << 32); break; case MSR_IA32_VMX_VMFUNC: *pdata = msrs->vmfunc_controls; break; default: return 1; } return 0; } /* * Copy the writable VMCS shadow fields back to the VMCS12, in case they have * been modified by the L1 guest. Note, "writable" in this context means * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" * VM-exit information fields (which are actually writable if the vCPU is * configured to support "VMWRITE to any supported field in the VMCS"). 
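 *
 * For reference (editor's note): the field tables consumed here are
 * generated from vmcs_shadow_fields.h, where an entry such as
 *
 *	SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
 *
 * expands, via the SHADOW_FIELD_* macros earlier in the file, to
 *
 *	{ GUEST_RIP, offsetof(struct vmcs12, guest_rip) }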
*/ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); struct shadow_vmcs_field field; unsigned long val; int i; if (WARN_ON(!shadow_vmcs)) return; preempt_disable(); vmcs_load(shadow_vmcs); for (i = 0; i < max_shadow_read_write_fields; i++) { field = shadow_read_write_fields[i]; val = __vmcs_readl(field.encoding); vmcs12_write_any(vmcs12, field.encoding, field.offset, val); } vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { const struct shadow_vmcs_field *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); struct shadow_vmcs_field field; unsigned long val; int i, q; if (WARN_ON(!shadow_vmcs)) return; vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; val = vmcs12_read_any(vmcs12, field.encoding, field.offset); __vmcs_writel(field.encoding, val); } } vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); } static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; hv_vcpu->nested.vm_id = evmcs->hv_vm_id; hv_vcpu->nested.vp_id = evmcs->hv_vp_id; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; vmcs12->guest_interruptibility_info = evmcs->guest_interruptibility_info; /* * Not present in struct vmcs12: * vmcs12->guest_ssp = evmcs->guest_ssp; */ } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { vmcs12->cpu_based_vm_exec_control = evmcs->cpu_based_vm_exec_control; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { vmcs12->exception_bitmap = evmcs->exception_bitmap; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { vmcs12->vm_entry_controls = evmcs->vm_entry_controls; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { vmcs12->vm_entry_intr_info_field = evmcs->vm_entry_intr_info_field; vmcs12->vm_entry_exception_error_code = evmcs->vm_entry_exception_error_code; vmcs12->vm_entry_instruction_len = evmcs->vm_entry_instruction_len; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { vmcs12->host_ia32_pat = evmcs->host_ia32_pat; vmcs12->host_ia32_efer = evmcs->host_ia32_efer; vmcs12->host_cr0 = evmcs->host_cr0; vmcs12->host_cr3 = evmcs->host_cr3; vmcs12->host_cr4 = evmcs->host_cr4; vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; vmcs12->host_rip = evmcs->host_rip; vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; vmcs12->host_es_selector = evmcs->host_es_selector; 
vmcs12->host_cs_selector = evmcs->host_cs_selector; vmcs12->host_ss_selector = evmcs->host_ss_selector; vmcs12->host_ds_selector = evmcs->host_ds_selector; vmcs12->host_fs_selector = evmcs->host_fs_selector; vmcs12->host_gs_selector = evmcs->host_gs_selector; vmcs12->host_tr_selector = evmcs->host_tr_selector; vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; /* * Not present in struct vmcs12: * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; * vmcs12->host_ssp = evmcs->host_ssp; * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; */ } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { vmcs12->pin_based_vm_exec_control = evmcs->pin_based_vm_exec_control; vmcs12->vm_exit_controls = evmcs->vm_exit_controls; vmcs12->secondary_vm_exec_control = evmcs->secondary_vm_exec_control; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { vmcs12->io_bitmap_a = evmcs->io_bitmap_a; vmcs12->io_bitmap_b = evmcs->io_bitmap_b; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { vmcs12->msr_bitmap = evmcs->msr_bitmap; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { vmcs12->guest_es_base = evmcs->guest_es_base; vmcs12->guest_cs_base = evmcs->guest_cs_base; vmcs12->guest_ss_base = evmcs->guest_ss_base; vmcs12->guest_ds_base = evmcs->guest_ds_base; vmcs12->guest_fs_base = evmcs->guest_fs_base; vmcs12->guest_gs_base = evmcs->guest_gs_base; vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; vmcs12->guest_tr_base = evmcs->guest_tr_base; vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; vmcs12->guest_idtr_base = evmcs->guest_idtr_base; vmcs12->guest_es_limit = evmcs->guest_es_limit; vmcs12->guest_cs_limit = evmcs->guest_cs_limit; vmcs12->guest_ss_limit = evmcs->guest_ss_limit; vmcs12->guest_ds_limit = evmcs->guest_ds_limit; vmcs12->guest_fs_limit = evmcs->guest_fs_limit; vmcs12->guest_gs_limit = evmcs->guest_gs_limit; vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; vmcs12->guest_tr_limit = evmcs->guest_tr_limit; vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; vmcs12->guest_es_selector = evmcs->guest_es_selector; vmcs12->guest_cs_selector = evmcs->guest_cs_selector; vmcs12->guest_ss_selector = evmcs->guest_ss_selector; vmcs12->guest_ds_selector = evmcs->guest_ds_selector; vmcs12->guest_fs_selector = evmcs->guest_fs_selector; vmcs12->guest_gs_selector = evmcs->guest_gs_selector; vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; vmcs12->guest_tr_selector = evmcs->guest_tr_selector; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { vmcs12->tsc_offset = evmcs->tsc_offset; vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; vmcs12->tsc_multiplier = evmcs->tsc_multiplier; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; vmcs12->guest_cr0 = evmcs->guest_cr0; vmcs12->guest_cr3 = evmcs->guest_cr3; vmcs12->guest_cr4 = evmcs->guest_cr4; vmcs12->guest_dr7 = evmcs->guest_dr7; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { vmcs12->host_fs_base = evmcs->host_fs_base; vmcs12->host_gs_base = evmcs->host_gs_base; vmcs12->host_tr_base = evmcs->host_tr_base; vmcs12->host_gdtr_base = evmcs->host_gdtr_base; vmcs12->host_idtr_base = evmcs->host_idtr_base; vmcs12->host_rsp = evmcs->host_rsp; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { vmcs12->ept_pointer = evmcs->ept_pointer; vmcs12->virtual_processor_id = evmcs->virtual_processor_id; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; vmcs12->guest_pending_dbg_exceptions = evmcs->guest_pending_dbg_exceptions; vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; vmcs12->guest_activity_state = evmcs->guest_activity_state; vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; /* * Not present in struct vmcs12: * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; */ } /* * Not used? 
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; * vmcs12->page_fault_error_code_mask = * evmcs->page_fault_error_code_mask; * vmcs12->page_fault_error_code_match = * evmcs->page_fault_error_code_match; * vmcs12->cr3_target_count = evmcs->cr3_target_count; * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; */ /* * Read only fields: * vmcs12->guest_physical_address = evmcs->guest_physical_address; * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; * vmcs12->exit_qualification = evmcs->exit_qualification; * vmcs12->guest_linear_address = evmcs->guest_linear_address; * * Not present in struct vmcs12: * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; */ return; #else /* CONFIG_KVM_HYPERV */ KVM_BUG_ON(1, vmx->vcpu.kvm); #endif /* CONFIG_KVM_HYPERV */ } static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); /* * Should not be changed by KVM: * * evmcs->host_es_selector = vmcs12->host_es_selector; * evmcs->host_cs_selector = vmcs12->host_cs_selector; * evmcs->host_ss_selector = vmcs12->host_ss_selector; * evmcs->host_ds_selector = vmcs12->host_ds_selector; * evmcs->host_fs_selector = vmcs12->host_fs_selector; * evmcs->host_gs_selector = vmcs12->host_gs_selector; * evmcs->host_tr_selector = vmcs12->host_tr_selector; * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; * evmcs->host_cr0 = vmcs12->host_cr0; * evmcs->host_cr3 = vmcs12->host_cr3; * evmcs->host_cr4 = vmcs12->host_cr4; * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; * evmcs->host_rip = vmcs12->host_rip; * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; * evmcs->host_fs_base = vmcs12->host_fs_base; * evmcs->host_gs_base = vmcs12->host_gs_base; * evmcs->host_tr_base = vmcs12->host_tr_base; * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; * evmcs->host_idtr_base = vmcs12->host_idtr_base; * evmcs->host_rsp = vmcs12->host_rsp; * sync_vmcs02_to_vmcs12() doesn't read these: * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; * evmcs->msr_bitmap = vmcs12->msr_bitmap; * evmcs->ept_pointer = vmcs12->ept_pointer; * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; * 
evmcs->tpr_threshold = vmcs12->tpr_threshold; * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; * evmcs->exception_bitmap = vmcs12->exception_bitmap; * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; * evmcs->page_fault_error_code_mask = * vmcs12->page_fault_error_code_mask; * evmcs->page_fault_error_code_match = * vmcs12->page_fault_error_code_match; * evmcs->cr3_target_count = vmcs12->cr3_target_count; * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; * evmcs->tsc_offset = vmcs12->tsc_offset; * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; * * Not present in struct vmcs12: * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; * evmcs->host_ssp = vmcs12->host_ssp; * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; * evmcs->guest_ssp = vmcs12->guest_ssp; */ evmcs->guest_es_selector = vmcs12->guest_es_selector; evmcs->guest_cs_selector = vmcs12->guest_cs_selector; evmcs->guest_ss_selector = vmcs12->guest_ss_selector; evmcs->guest_ds_selector = vmcs12->guest_ds_selector; evmcs->guest_fs_selector = vmcs12->guest_fs_selector; evmcs->guest_gs_selector = vmcs12->guest_gs_selector; evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; evmcs->guest_tr_selector = vmcs12->guest_tr_selector; evmcs->guest_es_limit = vmcs12->guest_es_limit; evmcs->guest_cs_limit = vmcs12->guest_cs_limit; evmcs->guest_ss_limit = vmcs12->guest_ss_limit; evmcs->guest_ds_limit = vmcs12->guest_ds_limit; evmcs->guest_fs_limit = vmcs12->guest_fs_limit; evmcs->guest_gs_limit = vmcs12->guest_gs_limit; evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; evmcs->guest_tr_limit = vmcs12->guest_tr_limit; evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 
evmcs->guest_es_base = vmcs12->guest_es_base; evmcs->guest_cs_base = vmcs12->guest_cs_base; evmcs->guest_ss_base = vmcs12->guest_ss_base; evmcs->guest_ds_base = vmcs12->guest_ds_base; evmcs->guest_fs_base = vmcs12->guest_fs_base; evmcs->guest_gs_base = vmcs12->guest_gs_base; evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; evmcs->guest_tr_base = vmcs12->guest_tr_base; evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; evmcs->guest_idtr_base = vmcs12->guest_idtr_base; evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; evmcs->guest_pending_dbg_exceptions = vmcs12->guest_pending_dbg_exceptions; evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; evmcs->guest_activity_state = vmcs12->guest_activity_state; evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; evmcs->guest_cr0 = vmcs12->guest_cr0; evmcs->guest_cr3 = vmcs12->guest_cr3; evmcs->guest_cr4 = vmcs12->guest_cr4; evmcs->guest_dr7 = vmcs12->guest_dr7; evmcs->guest_physical_address = vmcs12->guest_physical_address; evmcs->vm_instruction_error = vmcs12->vm_instruction_error; evmcs->vm_exit_reason = vmcs12->vm_exit_reason; evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; evmcs->exit_qualification = vmcs12->exit_qualification; evmcs->guest_linear_address = vmcs12->guest_linear_address; evmcs->guest_rsp = vmcs12->guest_rsp; evmcs->guest_rflags = vmcs12->guest_rflags; evmcs->guest_interruptibility_info = vmcs12->guest_interruptibility_info; evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; evmcs->vm_entry_controls = vmcs12->vm_entry_controls; evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; evmcs->vm_entry_exception_error_code = vmcs12->vm_entry_exception_error_code; evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; evmcs->guest_rip = vmcs12->guest_rip; evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; return; #else /* CONFIG_KVM_HYPERV */ KVM_BUG_ON(1, vmx->vcpu.kvm); #endif /* CONFIG_KVM_HYPERV */ } /* * This is an equivalent of the nested hypervisor executing the vmptrld * instruction. */ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( struct kvm_vcpu *vcpu, bool from_launch) { #ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; evmcs_gpa = nested_get_evmptr(vcpu); if (!evmptr_is_valid(evmcs_gpa)) { nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) return EVMPTRLD_ERROR; vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; /* * Currently, KVM only supports eVMCS version 1 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this * value to first u32 field of eVMCS which should specify eVMCS * VersionNumber. 
* * The guest should learn the eVMCS versions supported by the host by * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is * expected to set this CPUID leaf according to the value * returned in vmcs_version from nested_enable_evmcs(). * * However, it turns out that Microsoft Hyper-V fails to comply * with its own invented interface: when Hyper-V uses eVMCS, it * simply sets the first u32 field of the eVMCS to the revision_id * specified in MSR_IA32_VMX_BASIC, instead of to the eVMCS version * number, which should be one of the supported versions specified * in CPUID.0x4000000A.EAX[0:15]. * * To work around this Hyper-V bug, accept either a supported eVMCS * version or the VMCS12 revision_id as a valid value for the first * u32 field of the eVMCS. */ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); return EVMPTRLD_VMFAIL; } vmx->nested.hv_evmcs_vmptr = evmcs_gpa; evmcs_gpa_changed = true; /* * Unlike a normal vmcs12, an enlightened vmcs12 is not fully * reloaded from guest memory (read-only fields, fields not * present in struct hv_enlightened_vmcs, ...). Make sure there * are no leftovers. */ if (from_launch) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); memset(vmcs12, 0, sizeof(*vmcs12)); vmcs12->hdr.revision_id = VMCS12_REVISION; } } /* * Clean fields data can't be used on VMLAUNCH or when switching * between different L2 guests, as KVM keeps a single VMCS12 per L1. */ if (from_launch || evmcs_gpa_changed) { vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; vmx->nested.force_msr_bitmap_recalc = true; } return EVMPTRLD_SUCCEEDED; #else return EVMPTRLD_DISABLED; #endif } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) copy_vmcs12_to_enlightened(vmx); else copy_vmcs12_to_shadow(vmx); vmx->nested.need_vmcs12_to_shadow_sync = false; } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = container_of(timer, struct vcpu_vmx, nested.preemption_timer); vmx->nested.preemption_timer_expired = true; kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); kvm_vcpu_kick(&vmx->vcpu); return HRTIMER_NORESTART; } static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; if (!vmx->nested.has_preemption_timer_deadline) { vmx->nested.preemption_timer_deadline = vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; vmx->nested.has_preemption_timer_deadline = true; } return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; } static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, u64 preemption_timeout) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * A timer value of zero is architecturally guaranteed to cause * a VMExit prior to executing any instructions in the guest.
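 *
 * For a non-zero value, the conversion below computes the timeout in
 * nanoseconds as (value << rate) * 1000000 / virtual_tsc_khz. Worked
 * example with assumed numbers (an emulated rate of 5 and
 * virtual_tsc_khz = 2000000, i.e. a 2 GHz guest TSC): a timer value
 * of 62500 is 62500 << 5 = 2000000 TSC cycles, which is
 * 2000000 * 1000000 / 2000000 = 1000000 ns = 1 ms.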
*/ if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } if (vcpu->arch.virtual_tsc_khz == 0) return; preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); hrtimer_start(&vmx->nested.preemption_timer, ktime_add_ns(ktime_get(), preemption_timeout), HRTIMER_MODE_ABS_PINNED); } static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) return vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); else return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); } static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { struct kvm *kvm = vmx->vcpu.kvm; /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host * fields that come from L0 and are not constant, e.g. HOST_CR3, * will be set as needed prior to VMLAUNCH/VMRESUME. */ if (vmx->nested.vmcs02_initialized) return; vmx->nested.vmcs02_initialized = true; /* * We don't care what the EPTP value is we just need to guarantee * it's valid so we don't get a false positive when doing early * consistency checks. */ if (enable_ept && nested_early_check) vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); if (cpu_has_vmx_posted_intr()) vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* * PML is emulated for L2, but never enabled in hardware as the MMU * handles A/D emulation. Disabling PML for L2 also avoids having to * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { vmcs_write64(PML_ADDRESS, 0); vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); if (kvm_notify_vmexit_enabled(kvm)) vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); vmx_set_constant_host_state(vmx); } static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { prepare_vmcs02_constant_state(vmx); vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host. Emulate this behavior by using vpid01 for L2 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter * and VM-Exit are architecturally required to flush VPID=0, but *only* * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the * required flushes), but doing so would cause KVM to over-flush. E.g. * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, * and then runs L2 X again, then KVM can and should retain TLB entries * for VPID12=1. 
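 *
 * Summarizing the selection below: if vmcs12 enables VPID and vpid02
 * has been allocated, run L2 with vpid02; otherwise fall back to
 * vmx->vpid, the VPID KVM already uses for L1.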
*/ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } } static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) prepare_vmcs02_early_rare(vmx, vmcs12); /* * PIN CONTROLS */ exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; } else { vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 else exec_control |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING; #endif /* * A vmexit (to either L1 hypervisor or L0 userspace) is always needed * for I/O port accesses. */ exec_control |= CPU_BASED_UNCOND_IO_EXITING; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; /* * This bit will be computed in nested_get_vmcs12_pages, because * we do not have access to L1's MSR bitmap yet. For now, keep * the same bit as before, hoping to avoid multiple VMWRITEs that * only set/clear this bit. */ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; exec_controls_set(vmx, exec_control); /* * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; /* PML is emulated and never enabled in hardware for L2. */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; /* * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit. 
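 *
 * E.g. if vmcs12 sets CR4.UMIP but the CPU lacks UMIP, descriptor
 * table exiting is what allows KVM to intercept SGDT/SIDT/SLDT/STR
 * and inject #GP at CPL > 0, emulating UMIP's behavior for L2.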
*/ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) exec_control |= SECONDARY_EXEC_DESC; if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); secondary_exec_controls_set(vmx, exec_control); } /* * ENTRY CONTROLS * * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). * * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to * do the same for L2. */ exec_control = __vm_entry_controls_get(vmcs01); exec_control |= (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); /* * EXIT CONTROLS * * L2->L1 exit controls are emulated - the hardware exit is to L0 so * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* * Interrupt/Exception Fields */ if (vmx->nested.nested_run_pending) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs12->vm_entry_exception_error_code); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } } static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, u64 *ssp, u64 *ssp_tbl) { if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) *s_cet = vmcs_readl(GUEST_S_CET); if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { *ssp = vmcs_readl(GUEST_SSP); *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); } } static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, u64 ssp, u64 ssp_tbl) { if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, s_cet); if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { vmcs_writel(GUEST_SSP, ssp); vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); } } static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 
vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); /* * L1 may access the L2's PDPTR, so save them to construct * vmcs12 */ if (enable_ept) { vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } if (kvm_mpx_supported() && vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } if (nested_cpu_has_xsaves(vmcs12)) vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); /* * Whether page-faults are trapped is determined by a combination of * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 * doesn't care about page faults then we should set all of these to * L1's desires. However, if L0 does care about (some) page faults, it * is not easy (if at all possible?) to merge L0 and L1's desires, we * simply ask to exit on each and every L2 page fault. This is done by * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ if (vmx_need_pf_intercept(&vmx->vcpu)) { /* * TODO: if both L0 and L1 need the same MASK and MATCH, * go ahead and use it? 
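 *
 * For reference, the architectural MASK/MATCH rule is that a #PF in
 * the guest causes a VM-Exit iff
 *	((PFEC & MASK) == MATCH) == EB.PF
 * so writing MASK = MATCH = 0 below makes the comparison vacuously
 * true for every error code, and EB.PF = 1 then forces an exit on
 * each and every L2 page fault.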
*/ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); } else { vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); } if (cpu_has_vmx_apicv()) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); } /* * Make sure the msr_autostore list is up to date before we set the * count in the vmcs02. */ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); set_cr4_guest_host_mask(vmx); } /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. * Returns 0 on success, 1 on failure. Invalid state exit qualification code * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false; if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & vmx_get_supported_debugctl(vcpu, false)); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, vmx->nested.pre_vmenter_ssp, vmx->nested.pre_vmenter_ssp_tbl); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. 
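 *
 * Illustrative example of the bitwise-or: if vmcs01 intercepts #DB
 * (bit 1) and vmcs12 intercepts #UD (bit 6), vmcs02's merged bitmap
 * has both bits set, so either hypervisor's intercept takes effect.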
*/ vmx_update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( vcpu->arch.l1_tsc_offset, vmx_get_l2_tsc_offset(vcpu), vmx_get_l2_tsc_multiplier(vcpu)); vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( vcpu->arch.l1_tsc_scaling_ratio, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); /* * Override the CR0/CR4 read shadows after setting the effective guest * CR0/CR4. The common helpers also set the shadows, but they don't * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); /* * If guest state is invalid and unrestricted guest is disabled, then * L1 attempted VMEntry to L2 with invalid state: fail the VMEntry. * * However, when force loading the guest state (on SMM exit, or when * loading nested state after migration), it is possible to have * invalid guest state at this point; it will be fixed up later by * restoring the L2 register state. */ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } /* Load vmcs12->guest_cr3, shadowed by either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), from_vmentry, entry_failure_code)) return -EINVAL; /* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2. */ if (enable_ept) vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); /* * It was observed that genuine Hyper-V running in L1 doesn't reset * 'hv_clean_fields' by itself; it only sets the corresponding dirty * bits when it changes a field in the eVMCS. Mark all fields as clean * here.
*/ if (nested_vmx_is_evmptr12_valid(vmx)) evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; return 0; } static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) { if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && nested_cpu_has_virtual_nmis(vmcs12))) return -EINVAL; if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) return -EINVAL; return 0; } static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; break; case VMX_EPTP_MT_WB: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) return false; break; default: return false; } /* Page-walk levels validity. */ switch (new_eptp & VMX_EPTP_PWL_MASK) { case VMX_EPTP_PWL_5: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) return false; break; case VMX_EPTP_PWL_4: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) return false; break; default: return false; } /* Reserved bits should not be set */ if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } return true; } /* * Checks related to VM-Execution Control Fields */ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.msrs.pinbased_ctls_low, vmx->nested.msrs.pinbased_ctls_high)) || CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high))) return -EINVAL; if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.msrs.secondary_ctls_low, vmx->nested.msrs.secondary_ctls_high))) return -EINVAL; if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || nested_vmx_check_apic_access_controls(vcpu, vmcs12) || nested_vmx_check_apicv_controls(vcpu, vmcs12) || nested_vmx_check_nmi_controls(vmcs12) || nested_vmx_check_pml_controls(vcpu, vmcs12) || nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) return -EINVAL; if (!nested_cpu_has_preemption_timer(vmcs12) && nested_cpu_has_save_preemption_timer(vmcs12)) return -EINVAL; if (nested_cpu_has_ept(vmcs12) && CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { if (CC(vmcs12->vm_function_control & ~vmx->nested.msrs.vmfunc_controls)) return -EINVAL; if (nested_cpu_has_eptp_switching(vmcs12)) { if (CC(!nested_cpu_has_ept(vmcs12)) || CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) return -EINVAL; } } return 0; } /* * Checks related to VM-Exit Control Fields */ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 
vmx->nested.msrs.exit_ctls_low, vmx->nested.msrs.exit_ctls_high)) || CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) return -EINVAL; return 0; } /* * Checks related to VM-Entry Control Fields */ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.msrs.entry_ctls_low, vmx->nested.msrs.entry_ctls_high))) return -EINVAL; /* * From the Intel SDM, volume 3: * Fields relevant to VM-entry event injection must be set properly. * These fields are the VM-entry interruption-information field, the * VM-entry exception error code, and the VM-entry instruction length. */ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { u32 intr_info = vmcs12->vm_entry_intr_info_field; u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; /* VM-entry interruption-info field: interruption type */ if (CC(intr_type == INTR_TYPE_RESERVED) || CC(intr_type == INTR_TYPE_OTHER_EVENT && !nested_cpu_supports_monitor_trap_flag(vcpu))) return -EINVAL; /* VM-entry interruption-info field: vector */ if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; /* * Cannot deliver error code in real mode or if the interrupt * type is not hardware exception. For other cases, do the * consistency check only if the vCPU doesn't enumerate * VMX_BASIC_NO_HW_ERROR_CODE_CC. */ if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { if (CC(has_error_code)) return -EINVAL; } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { if (CC(has_error_code != x86_exception_has_error_code(vector))) return -EINVAL; } /* VM-entry exception error code */ if (CC(has_error_code && vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) return -EINVAL; /* VM-entry instruction length */ switch (intr_type) { case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; } } if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) return -EINVAL; return 0; } static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_check_vm_execution_controls(vcpu, vmcs12) || nested_check_vm_exit_controls(vcpu, vmcs12) || nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; #ifdef CONFIG_KVM_HYPERV if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); #endif return 0; } static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { #ifdef CONFIG_X86_64 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != !!(vcpu->arch.efer & EFER_LMA))) return -EINVAL; #endif return 0; } static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) { /* * Check that the given linear address is canonical after a VM exit * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 
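 *
 * E.g. with HOST_CR4.LA57 clear the address must be 48-bit canonical,
 * i.e. bits 63:47 all equal to bit 47: 0xffff800000000000 passes,
 * while 0x0000800000000000 fails. With LA57 set, the same
 * sign-extension rule applies at bit 56 instead.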
*/ u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; return !__is_canonical_address(la, l1_address_bits_on_exit); } static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, u64 ssp, u64 ssp_tbl) { if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) return -EINVAL; return 0; } static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) return -EINVAL; if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), vmcs12->host_ia32_perf_global_ctrl))) return -EINVAL; if (ia32e) { if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; } if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_cs_selector == 0) || CC(vmcs12->host_tr_selector == 0) || CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, * the values of the LMA and LME bits in the field must each be that of * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) return -EINVAL; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, vmcs12->host_ssp_tbl)) return -EINVAL; /* * IA32_S_CET and SSP must be canonical if the host will * enter 64-bit mode after VM-exit; otherwise, higher * 32-bits must be all 0s. 
*/ if (ia32e) { if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) return -EINVAL; } else { if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) return -EINVAL; } } return 0; } static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; struct vmcs_hdr hdr; if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; if (ghc->gpa != vmcs12->vmcs_link_pointer && CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE))) return -EINVAL; if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, offsetof(struct vmcs12, hdr), sizeof(hdr)))) return -EINVAL; if (CC(hdr.revision_id != VMCS12_REVISION) || CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) return -EINVAL; return 0; } /* * Checks related to Guest Non-register State */ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) return -EINVAL; return 0; } static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); *entry_failure_code = ENTRY_FAIL_DEFAULT; if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) return -EINVAL; if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) return -EINVAL; /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: * - Bits reserved in the IA32_EFER MSR must be 0. * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of * the IA-32e mode guest VM-exit control. It must also be identical * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. 
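 *
 * Worked example: entering a 64-bit L2 (ia32e = 1) with CR0.PG = 1
 * requires EFER.LMA = 1 and EFER.LME = 1 in the guest IA32_EFER
 * field; a value with LMA = 1 but LME = 0 trips the CC() checks
 * below and the VM-entry fails.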
*/ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || CC(((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl)) return -EINVAL; /* * Guest SSP must have 63:N bits identical, rather than * be canonical (i.e., 63:N-1 bits identical), where N is * the CPU's maximum linear-address width. Similar to * is_noncanonical_msr_address(), use the host's * linear-address width. */ if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) return -EINVAL; } if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; return 0; } static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; bool vm_fail; if (!nested_early_check) return 0; if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); preempt_disable(); vmx_prepare_switch_to_guest(vcpu); /* * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. * there is no need to preserve other bits or save/restore the field. */ vmcs_writel(GUEST_RFLAGS, 0); cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->loaded_vmcs->host_state.cr4 = cr4; } vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, __vmx_vcpu_run_flags(vmx)); if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vm_fail) { u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); preempt_enable(); trace_kvm_nested_vmenter_failed( "early hardware check VM-instruction error: ", error); WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } /* * VMExit clears RFLAGS.IF and DR7, even on a consistency check. */ if (hw_breakpoint_active()) set_debugreg(__this_cpu_read(cpu_dr7), 7); local_irq_enable(); preempt_enable(); /* * A non-failing VMEntry means we somehow entered guest mode with * an illegal RIP, and that's just the tip of the iceberg. There * is no telling what memory has been modified or what state has * been exposed to unknown code. Hitting this all but guarantees * a (very critical) hardware issue. */ WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & VMX_EXIT_REASONS_FAILED_VMENTRY)); return 0; } #ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * hv_evmcs may end up being not mapped after migration (when * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. 
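 *
 * (The EVMPTR_MAP_PENDING check below is what distinguishes this
 * post-migration "mapping deferred" case from a normal, already
 * mapped eVMCS.)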
*/ if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); if (evmptrld_status == EVMPTRLD_VMFAIL || evmptrld_status == EVMPTRLD_ERROR) return false; /* * Post migration VMCS12 always provides the most actual * information, copy it to eVMCS upon entry. */ vmx->nested.need_vmcs12_to_shadow_sync = true; } return true; } #endif static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; } if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { map = &vmx->nested.apic_access_page_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return false; } } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { map = &vmx->nested.virtual_apic_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * The processor will never use the TPR shadow, simply * clear the bit from the execution control. Such a * configuration is useless, but it happens in tests. * For any other configuration, failing the vm entry is * _not_ what the processor does but it's basically the * only possibility we have. */ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); } else { /* * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. */ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } if (nested_cpu_has_posted_intr(vmcs12)) { map = &vmx->nested.pi_desc_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { vmx->nested.pi_desc = (struct pi_desc *)(((void *)map->hva) + offset_in_page(vmcs12->posted_intr_desc_addr)); vmcs_write64(POSTED_INTR_DESC_ADDR, pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); } else { /* * Defer the KVM_INTERNAL_EXIT until KVM tries to * access the contents of the VMCS12 posted interrupt * descriptor. (Note that KVM may do this when it * should not, per the architectural specification.) */ vmx->nested.pi_desc = NULL; pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); return true; } static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV /* * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post * migration. 
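 *
 * A failure here is reported to userspace as KVM_EXIT_INTERNAL_ERROR
 * (see below); there is no architected VMX failure mode that could
 * describe "the eVMCS page disappeared across migration" to L1.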
*/ if (!nested_get_evmcs_page(vcpu)) { pr_debug_ratelimited("%s: enlightened vmptrld failed\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return false; } #endif if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; return true; } static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t dst; if (WARN_ON_ONCE(!is_guest_mode(vcpu))) return 0; if (WARN_ON_ONCE(vmx->nested.pml_full)) return 1; /* * Check if PML is enabled for the nested guest. Whether eptp bit 6 is * set is already checked as part of A/D emulation. */ vmcs12 = get_vmcs12(vcpu); if (!nested_cpu_has_pml(vmcs12)) return 0; if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } gpa &= ~0xFFFull; dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, offset_in_page(dst), sizeof(gpa))) return 0; vmcs12->guest_pml_index--; return 0; } /* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. * Note that many of these exceptions have priority over VM exits, so they * don't have to be checked again here. */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 0; } return 1; } static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. * * Returns: * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode * NVMX_VMENTRY_VMFAIL: Consistency check VMFail * NVMX_VMENTRY_VMEXIT: Consistency check VMExit * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error */ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, }; u32 failed_index; trace_kvm_nested_vmenter(kvm_rip_read(vcpu), vmx->nested.current_vmptr, vmcs12->guest_rip, vmcs12->guest_intr_status, vmcs12->vm_entry_intr_info_field, vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, vmcs12->ept_pointer, vmcs12->guest_cr3, KVM_ISA_VMX); kvm_service_local_tlb_flush_requests(vcpu); if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, &vmx->nested.pre_vmenter_ssp, &vmx->nested.pre_vmenter_ssp_tbl); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* * nested early checks are disabled. In the event of a "late" VM-Fail, * i.e. 
a VM-Fail detected by hardware but not KVM, KVM must unwind its * software model to the pre-VMEntry host state. When EPT is disabled, * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is * guaranteed to be overwritten with a shadow CR3 prior to re-entering * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail * path would need to manually save/restore vmcs01.GUEST_CR3. */ if (!enable_ept && !nested_early_check) vmcs_writel(GUEST_CR3, vcpu->arch.cr3); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; } } enter_guest_mode(vcpu); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; } if (from_vmentry) { failed_index = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); if (failed_index) { exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; } } else { /* * The MMU is not initialized to point at the right entities yet and * "get pages" would need to read data from the guest (i.e. we will * need to perform gpa to hpa translation). Request a call * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit * unconditionally. Take care to pull data from vmcs01 as appropriate, * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING)) || kvm_apic_has_pending_init_or_sipi(vcpu) || kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* * Do not start the preemption timer hrtimer until after we know * we are successful, so that only nested_vmx_vmexit needs to cancel * the timer. */ vmx->nested.preemption_timer_expired = false; if (nested_cpu_has_preemption_timer(vmcs12)) { u64 timer_value = vmx_calc_preemption_timer_value(vcpu); vmx_start_preemption_timer(vcpu, timer_value); } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). 
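	 *
	 * Concretely, "set the success flag" means load_vmcs12_host_state()
	 * loads RFLAGS with X86_EFLAGS_FIXED, i.e. with CF and ZF clear,
	 * which is the architectural VMsucceed indication for
	 * VMLAUNCH/VMRESUME.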
 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_branch_retired(vcpu);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS.  Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and acting appropriately
	 * when they fail: as the SDM explains, some conditions should cause
	 * the instruction to fail, while others will cause the instruction to
	 * seem to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations that will be caught anyway by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
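	 *
	 * nested_run_pending is set across the emulated VM-Entry: it tells
	 * the rest of KVM that VMLAUNCH/VMRESUME emulation is in flight,
	 * e.g. so that pending events are neither injected nor morphed into
	 * VM-Exits until the entry to L2 actually completes.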
*/ vmx->nested.nested_run_pending = 1; vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will * also be used as part of restoring nVMX state for * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet). */ nested_cache_shadow_vmcs12(vcpu, vmcs12); switch (vmcs12->guest_activity_state) { case GUEST_ACTIVITY_HLT: /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be * awakened by event injection or by an NMI-window VM-exit or * by an interrupt-window VM-exit, halt the vcpu. */ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; } return 1; vmentry_failed: vmx->nested.nested_run_pending = 0; if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) return 0; if (status == NVMX_VMENTRY_VMEXIT) return 1; WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 * didn't trap the bit, because if L1 did, so would L0). * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have * been modified by L2, and L1 knows it. So just leave the old value of * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 * isn't relevant, because if L0 traps this bit it can set it to anything. * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have * changed these bits, and therefore they need to be updated, but L0 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 
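 *
 * In short, with "owned" = vcpu->arch.cr0_guest_owned_bits (bits trapped by
 * neither L0 nor L1) and "mask" = vmcs12->cr0_guest_host_mask (bits trapped
 * by L1, and therefore by L0 as well), the result is:
 *
 *   (vmcs02.GUEST_CR0       &  owned)          |   // case 1
 *   (vmcs12.guest_cr0       &  mask)           |   // case 2
 *   (vmcs02.CR0_READ_SHADOW & ~(mask | owned))     // case 3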
*/ static inline unsigned long vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | vcpu->arch.cr0_guest_owned_bits)); } static inline unsigned long vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | vcpu->arch.cr4_guest_owned_bits)); } static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info) { u32 idt_vectoring; unsigned int nr; /* * Per the SDM, VM-Exits due to double and triple faults are never * considered to occur during event delivery, even if the double/triple * fault is the result of an escalating vectoring issue. * * Note, the SDM qualifies the double fault behavior with "The original * event results in a double-fault exception". It's unclear why the * qualification exists since exits due to double fault can occur only * while vectoring a different exception (injected events are never * subject to interception), i.e. there's _always_ an original event. * * The SDM also uses NMI as a confusing example for the "original event * causes the VM exit directly" clause. NMI isn't special in any way, * the same rule applies to all events that cause an exit directly. * NMI is an odd choice for the example because NMIs can only occur on * instruction boundaries, i.e. they _can't_ occur during vectoring. */ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && is_double_fault(exit_intr_info))) { vmcs12->idt_vectoring_info_field = 0; } else if (vcpu->arch.exception.injected) { nr = vcpu->arch.exception.vector; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (kvm_exception_is_soft(nr)) { vmcs12->vm_exit_instruction_len = vcpu->arch.event_exit_inst_len; idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; } else idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; if (vcpu->arch.exception.has_error_code) { idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; vmcs12->idt_vectoring_error_code = vcpu->arch.exception.error_code; } vmcs12->idt_vectoring_info_field = idt_vectoring; } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.injected) { nr = vcpu->arch.interrupt.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { idt_vectoring |= INTR_TYPE_SOFT_INTR; vmcs12->vm_entry_instruction_len = vcpu->arch.event_exit_inst_len; } else idt_vectoring |= INTR_TYPE_EXT_INTR; vmcs12->idt_vectoring_info_field = idt_vectoring; } else { vmcs12->idt_vectoring_info_field = 0; } } void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gfn_t gfn; /* * Don't need to mark the APIC access page dirty; it is never * written to by the CPU during APIC virtualization. 
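	 *
	 * The virtual-APIC page and the posted-interrupt descriptor, by
	 * contrast, are written by the CPU (by TPR virtualization and by
	 * posted-interrupt processing respectively), hence the explicit
	 * dirty marking below.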
*/ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; kvm_vcpu_mark_page_dirty(vcpu, gfn); } if (nested_cpu_has_posted_intr(vmcs12)) { gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; kvm_vcpu_mark_page_dirty(vcpu, gfn); } } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; void *vapic_page; u16 status; if (!vmx->nested.pi_pending) return 0; if (!vmx->nested.pi_desc) goto mmio_needed; vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; max_irr = pi_find_highest_vector(vmx->nested.pi_desc); if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); status = vmcs_read16(GUEST_INTR_STATUS); if ((u8)max_irr > ((u8)status & 0xff)) { status &= ~0xff; status |= (u8)max_irr; vmcs_write16(GUEST_INTR_STATUS, status); } } nested_mark_vmcs12_pages_dirty(vcpu); return 0; mmio_needed: kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); return -ENXIO; } static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long exit_qual; if (ex->has_payload) { exit_qual = ex->payload; } else if (ex->vector == PF_VECTOR) { exit_qual = vcpu->arch.cr2; } else if (ex->vector == DB_VECTOR) { exit_qual = vcpu->arch.dr6; exit_qual &= ~DR6_BT; exit_qual ^= DR6_ACTIVE_LOW; } else { exit_qual = 0; } /* * Unlike AMD's Paged Real Mode, which reports an error code on #PF * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the * "has error code" flags on VM-Exit if the CPU is in Real Mode. */ if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the * injected error code for VM-Entry. Drop the bits to mimic * hardware and avoid inducing failure on nested VM-Entry if L1 * chooses to inject the exception back to L2. AMD CPUs _do_ * generate "full" 32-bit error codes, so KVM allows userspace * to inject exception error codes with bits 31:16 set. */ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (kvm_exception_is_soft(ex->vector)) intr_info |= INTR_TYPE_SOFT_EXCEPTION; else intr_info |= INTR_TYPE_HARD_EXCEPTION; if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && vmx_get_nmi_mask(vcpu)) intr_info |= INTR_INFO_UNBLOCK_NMI; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); } /* * Returns true if a debug trap is (likely) pending delivery. Infer the class * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). * Using the payload is flawed because code breakpoints (fault-like) and data * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. * this will return false positives if a to-be-injected code breakpoint #DB is * pending (from KVM's perspective, but not "pending" across an instruction * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it * too is trap-like. 
* * KVM "works" despite these flaws as ICEBP isn't currently supported by the * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the * #DB has already happened), and MTF isn't marked pending on code breakpoints * from the emulator (because such #DBs are fault-like and thus don't trigger * actions that fire on instruction retire). */ static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) { if (!ex->pending || ex->vector != DB_VECTOR) return 0; /* General Detect #DBs are always fault-like. */ return ex->payload & ~DR6_BD; } /* * Returns true if there's a pending #DB exception that is lower priority than * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by * KVM, but could theoretically be injected by userspace. Note, this code is * imperfect, see above. */ static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) { return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; } /* * Certain VM-exits set the 'pending debug exceptions' field to indicate a * recognized #DB (data or single-step) that has yet to be delivered. Since KVM * represents these debug traps with a payload that is said to be compatible * with the 'pending debug exceptions' field, write the payload to the VMCS * field if a VM-exit is delivered before the debug trap. */ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) { unsigned long pending_dbg; pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); if (pending_dbg) vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); } static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) { return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && to_vmx(vcpu)->nested.preemption_timer_expired; } static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_vmx *vmx = to_vmx(vcpu); void *vapic = vmx->nested.virtual_apic_map.hva; int max_irr, vppr; if (nested_vmx_preemption_timer_pending(vcpu) || vmx->nested.mtf_pending) return true; /* * Virtual Interrupt Delivery doesn't require manual injection. Either * the interrupt is already in GUEST_RVI and will be recognized by CPU * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move * the interrupt from the PIR to RVI prior to entering the guest. */ if (for_injection) return false; if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || __vmx_interrupt_blocked(vcpu)) return false; if (!vapic) return false; vppr = *((u32 *)(vapic + APIC_PROCPRI)); max_irr = vmx_get_rvi(); if ((max_irr & 0xf0) > (vppr & 0xf0)) return true; if (vmx->nested.pi_pending && vmx->nested.pi_desc && pi_test_on(vmx->nested.pi_desc)) { max_irr = pi_find_highest_vector(vmx->nested.pi_desc); if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) return true; } return false; } /* * Per the Intel SDM's table "Priority Among Concurrent Events", with minor * edits to fill in missing examples, e.g. #DB due to split-lock accesses, * and less minor edits to splice in the priority of VMX Non-Root specific * events, e.g. MTF and NMI/INTR-window exiting. 
* * 1 Hardware Reset and Machine Checks * - RESET * - Machine Check * * 2 Trap on Task Switch * - T flag in TSS is set (on task switch) * * 3 External Hardware Interventions * - FLUSH * - STOPCLK * - SMI * - INIT * * 3.5 Monitor Trap Flag (MTF) VM-exit[1] * * 4 Traps on Previous Instruction * - Breakpoints * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O * breakpoint, or #DB due to a split-lock access) * * 4.3 VMX-preemption timer expired VM-exit * * 4.6 NMI-window exiting VM-exit[2] * * 5 Nonmaskable Interrupts (NMI) * * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery * * 6 Maskable Hardware Interrupts * * 7 Code Breakpoint Fault * * 8 Faults from Fetching Next Instruction * - Code-Segment Limit Violation * - Code Page Fault * - Control protection exception (missing ENDBRANCH at target of indirect * call or jump) * * 9 Faults from Decoding Next Instruction * - Instruction length > 15 bytes * - Invalid Opcode * - Coprocessor Not Available * *10 Faults on Executing Instruction * - Overflow * - Bound error * - Invalid TSS * - Segment Not Present * - Stack fault * - General Protection * - Data Page Fault * - Alignment Check * - x86 FPU Floating-point exception * - SIMD floating-point exception * - Virtualization exception * - Control protection exception * * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), * INIT signals, and higher priority events take priority over MTF VM exits. * MTF VM exits take priority over debug-trap exceptions and lower priority * events. * * [2] Debug-trap exceptions and higher priority events take priority over VM exits * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption * timer take priority over VM exits caused by the "NMI-window exiting" * VM-execution control and lower priority events. * * [3] Debug-trap exceptions and higher priority events take priority over VM exits * caused by "NMI-window exiting". VM exits caused by this control take * priority over non-maskable interrupts (NMIs) and lower priority events. * * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, * non-maskable interrupts (NMIs) and higher priority events take priority over * delivery of a virtual interrupt; delivery of a virtual interrupt takes * priority over external interrupts and lower priority events. */ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Only a pending nested run blocks a pending exception. If there is a * previously injected event, the pending exception occurred while said * event was being delivered and thus needs to be handled. */ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* * Events that don't require injection, i.e. that are virtualized by * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need * to regain control in order to deliver the event, and hardware will * handle event ordering, e.g. with respect to injected exceptions. * * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a * VM-Exit that occurred _during_ instruction execution; new events, * irrespective of whether or not they're injected, are blocked until * the instruction completes. 
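	 *
	 * E.g. if an event whose delivery caused a VM-Exit must be
	 * re-injected, no new IRQ/NMI can be recognized until that
	 * (re-)delivery completes at the next instruction boundary.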
*/ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); /* * Inject events are blocked by nested VM-Enter, as KVM is responsible * for managing priority between concurrent events, i.e. KVM needs to * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); /* MTF is discarded if the vCPU is in WFS. */ vmx->nested.mtf_pending = false; return 0; } if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (block_nested_events) return -EBUSY; clear_bit(KVM_APIC_SIPI, &apic->pending_events); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, apic->sipi_vector & 0xFFUL); return 0; } /* Fallthrough, the SIPI is completely ignored. */ } /* * Process exceptions that are higher priority than Monitor Trap Flag: * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but * could theoretically come in from userspace), and ICEBP (INT1). * * TODO: SMIs have higher priority than MTF and trap-like #DBs (except * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF * across SMI/RSM as it should; that needs to be addressed in order to * prioritize SMI over MTF and trap-like #DBs. */ if (vcpu->arch.exception_vmexit.pending && !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { if (block_nested_exceptions) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu); return 0; } if (vcpu->arch.exception.pending && !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { if (block_nested_exceptions) return -EBUSY; goto no_vmexit; } if (vmx->nested.mtf_pending) { if (block_nested_events) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); return 0; } if (vcpu->arch.exception_vmexit.pending) { if (block_nested_exceptions) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu); return 0; } if (vcpu->arch.exception.pending) { if (block_nested_exceptions) return -EBUSY; goto no_vmexit; } if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; } if (vcpu->arch.smi_pending && !is_smm(vcpu)) { if (block_nested_events) return -EBUSY; goto no_vmexit; } if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; if (!nested_exit_on_nmi(vcpu)) goto no_vmexit; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK, 0); /* * The NMI-triggered VM exit counts as injection: * clear this one and block further NMIs. 
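	 * As on bare metal, delivering the NMI (here, as an NMI-induced
	 * VM-Exit to L1) leaves NMIs blocked until the next IRET.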
*/ vcpu->arch.nmi_pending = 0; vmx_set_nmi_mask(vcpu, true); return 0; } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { int irq; if (!nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY; goto no_vmexit; } if (!nested_exit_intr_ack_set(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; } irq = kvm_cpu_get_extint(vcpu); if (irq != -1) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); return 0; } irq = kvm_apic_has_interrupt(vcpu); if (WARN_ON_ONCE(irq < 0)) goto no_vmexit; /* * If the IRQ is L2's PI notification vector, process posted * interrupts for L2 instead of injecting VM-Exit, as the * detection/morphing architecturally occurs when the IRQ is * delivered to the CPU. Note, only interrupts that are routed * through the local APIC trigger posted interrupt processing, * and enabling posted interrupts requires ACK-on-exit. */ if (irq == vmx->nested.posted_intr_nv) { /* * Nested posted interrupts are delivered via RVI, i.e. * aren't injected by KVM, and so can be queued even if * manual event injection is disallowed. */ if (block_non_injected_events) return -EBUSY; vmx->nested.pi_pending = true; kvm_apic_clear_irr(vcpu, irq); goto no_vmexit; } if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); /* * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI * if APICv is active. */ kvm_apic_ack_interrupt(vcpu, irq); return 0; } no_vmexit: return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); u64 value; if (ktime_to_ns(remaining) <= 0) return 0; value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; do_div(value, 1000000); return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; } static bool is_vmcs12_ext_field(unsigned long field) { switch (field) { case GUEST_ES_SELECTOR: case GUEST_CS_SELECTOR: case GUEST_SS_SELECTOR: case GUEST_DS_SELECTOR: case GUEST_FS_SELECTOR: case GUEST_GS_SELECTOR: case GUEST_LDTR_SELECTOR: case GUEST_TR_SELECTOR: case GUEST_ES_LIMIT: case GUEST_CS_LIMIT: case GUEST_SS_LIMIT: case GUEST_DS_LIMIT: case GUEST_FS_LIMIT: case GUEST_GS_LIMIT: case GUEST_LDTR_LIMIT: case GUEST_TR_LIMIT: case GUEST_GDTR_LIMIT: case GUEST_IDTR_LIMIT: case GUEST_ES_AR_BYTES: case GUEST_DS_AR_BYTES: case GUEST_FS_AR_BYTES: case GUEST_GS_AR_BYTES: case GUEST_LDTR_AR_BYTES: case GUEST_TR_AR_BYTES: case GUEST_ES_BASE: case GUEST_CS_BASE: case GUEST_SS_BASE: case GUEST_DS_BASE: case GUEST_FS_BASE: case GUEST_GS_BASE: case GUEST_LDTR_BASE: case GUEST_TR_BASE: case GUEST_GDTR_BASE: case GUEST_IDTR_BASE: case GUEST_PENDING_DBG_EXCEPTIONS: case GUEST_BNDCFGS: return true; default: break; } return false; } static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 
vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) return; WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } /* * Update the guest state fields of vmcs12 to reflect changes that * occurred while L2 was running. (The "IA-32e mode guest" bit of the * VM-entry controls is also updated, since this is really a guest * state bit.) 
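 *
 * The "rare" fields are synced eagerly here only when an enlightened VMCS is
 * in use; otherwise the copy is deferred, via
 * need_sync_vmcs02_to_vmcs12_rare, until the fields are actually needed
 * (see copy_vmcs02_to_vmcs12_rare()).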
*/ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !nested_vmx_is_evmptr12_valid(vmx); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); vmcs12->guest_rsp = kvm_rsp_read(vcpu); vmcs12->guest_rip = kvm_rip_read(vcpu); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if (nested_cpu_has_preemption_timer(vmcs12) && vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && !vmx->nested.nested_run_pending) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. * * Additionally, restore L2's PDPTR to vmcs12. */ if (enable_ept) { vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } } vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. * Writes to DEBUGCTL that aren't intercepted by L1 are immediately * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into * vmcs02 doesn't strictly track vmcs12. */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, &vmcs12->guest_ssp, &vmcs12->guest_ssp_tbl); } /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), * and this function updates it to reflect the changes to the guest state while * L2 was running (and perhaps made some exits which were handled directly by L0 * without going back to L1), and to reflect the exit reason. * Note that we do not have to copy here all VMCS fields, just those that * could have changed by the L2 guest or the exit - i.e., the guest-state and * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. 
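 *
 * The guest-state fields themselves are refreshed separately, by
 * sync_vmcs02_to_vmcs12(); this function is only concerned with the exit
 * information fields (and launch_state).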
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification, u32 exit_insn_len)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = vm_exit_reason;
	if (vmx_get_exit_reason(vcpu).enclave_mode)
		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
	vmcs12->exit_qualification = exit_qualification;

	/*
	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
	 * exit info fields are unmodified.
	 */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject
		 * into L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12,
					  vm_exit_reason, exit_intr_info);

		vmcs12->vm_exit_intr_info = exit_intr_info;
		vmcs12->vm_exit_instruction_len = exit_insn_len;
		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

		/*
		 * According to the spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}
}

/*
 * Part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	enum vm_entry_failure_code ignored;
	struct kvm_segment seg;

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
	 *
	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it).
	 */
	vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmx_set_cr0(vcpu, vmcs12->host_cr0);

	/* Same as above - no reason to call set_cr4_guest_host_mask(). */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs12->host_cr4);

	nested_ept_uninit_mmu_context(vcpu);

	/*
	 * Only PDPTE load can fail as the value of cr3 was checked on entry
	 * and couldn't have changed.
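	 * A PDPTE load failure at this point is unrecoverable for the
	 * emulation, hence the VMX abort (VMX_ABORT_LOAD_HOST_PDPTE_FAIL)
	 * below rather than a normal error return.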
*/ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); /* * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. * otherwise CET state should be retained across VM-exit, i.e., * guest values should be propagated from vmcs12 to vmcs01. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, vmcs12->host_ssp_tbl); else vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, .selector = vmcs12->host_cs_selector, .type = 11, .present = 1, .s = 1, .g = 1 }; if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) seg.l = 1; else seg.db = 1; __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, .type = 3, .present = 1, .s = 1, .db = 1, .g = 1 }; seg.selector = vmcs12->host_ds_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, .selector = vmcs12->host_tr_selector, .type = 11, .present = 1 }; __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); memset(&seg, 0, sizeof(seg)); seg.unusable = 1; __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmx_guest_debugctl_write(vcpu, 0); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) return vmcs_read64(GUEST_IA32_EFER); if (cpu_has_load_ia32_efer()) return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) return vmx->msr_autoload.guest.val[i].value; } efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if 
(efer_msr) return efer_msr->data; return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msr_entry g, h; gpa_t gpa; u32 i, j; vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { /* * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set * as vmcs01.GUEST_DR7 contains a userspace defined value * and vcpu->arch.dr7 is not squirreled away before the * nested VMENTER (not worth adding a variable in nested_vmx). */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) kvm_set_dr(vcpu, 7, DR7_FIXED_1); else WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); } /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ vmx_reload_guest_debugctl(vcpu); /* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model. */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs * from vmcs01 (if necessary). The PDPTRs are not loaded on * VMFail, like everything else we just need to ensure our * software model is up-to-date. */ if (enable_ept && is_pae_paging(vcpu)) ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation * of VMFail), leaving the nested VM's MSRs in the software model * (incorrect behavior) and snapshotting the modified MSRs (too * expensive since the lists are unbound by hardware). For each * MSR that was (prematurely) loaded from the nested VMEntry load * list, reload it from the exit load list if it exists and differs * from the guest value. The intent is to stuff host state as * silently as possible, not to fully process the exit load list. */ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { pr_debug_ratelimited( "%s read MSR index failed (%u, 0x%08llx)\n", __func__, i, gpa); goto vmabort; } for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { pr_debug_ratelimited( "%s read MSR failed (%u, 0x%08llx)\n", __func__, j, gpa); goto vmabort; } if (h.index != g.index) continue; if (h.value == g.value) break; if (nested_vmx_load_msr_check(vcpu, &h)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, j, h.index, h.reserved); goto vmabort; } if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); goto vmabort; } } } return; vmabort: nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) */ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification, u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* Pending MTF traps are discarded on VM-Exit. */ vmx->nested.mtf_pending = false; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); #ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map * Enlightened VMCS after migration and we still need to * do that when something is forcing L2->L1 exit prior to * the first L2 run. */ (void)nested_get_evmcs_page(vcpu); } #endif /* Service pending TLB flush requests for L2 before switching to L1. */ kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are * up-to-date before switching to L1. */ if (enable_ept && is_pae_paging(vcpu)) vmx_ept_load_pdptrs(vcpu); leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, exit_intr_info, exit_qualification, exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will * also be used to capture vmcs12 cache as part of * capturing nVMX state for snapshot (migration). * * Otherwise, this flush will dirty guest memory at a * point it is already assumed by user-space to be * immutable. */ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); } else { /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a * problem with L0. And we should never get here with a * VMFail of any type if early consistency checks are enabled. */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); WARN_ON_ONCE(nested_early_check); } /* * Drop events/exceptions that were queued for re-injection to L2 * (picked up via vmx_complete_interrupts()), as well as exceptions * that were pending for L2. Note, this must NOT be hoisted above * prepare_vmcs12(), events/exceptions queued for re-injection need to * be captured in vmcs12 (see vmcs12_save_pending_event()). 
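	 *
	 * From L1's point of view those events either completed or are
	 * recorded in IDT_VECTORING_INFO_FIELD, so KVM must not replay them
	 * out of its own queues once vmcs01 is loaded again.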
*/ vcpu->arch.nmi_injected = false; kvm_clear_exception_queue(vcpu); kvm_clear_interrupt_queue(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); } if (vmx->nested.update_vmcs01_cpu_dirty_logging) { vmx->nested.update_vmcs01_cpu_dirty_logging = false; vmx_update_cpu_dirty_logging(vcpu); } nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); } if (vmx->nested.update_vmcs01_apicv_status) { vmx->nested.update_vmcs01_apicv_status = false; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } if (vmx->nested.update_vmcs01_hwapic_isr) { vmx->nested.update_vmcs01_hwapic_isr = false; kvm_apic_update_hwapic_isr(vcpu); } if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, vmcs12->vm_exit_intr_info, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); load_vmcs12_host_state(vcpu, vmcs12); /* * Process events if an injectable IRQ or NMI is pending, even * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). * If an event became pending while L2 was active, KVM needs to * either inject the event or request an IRQ/NMI window. SMIs * don't need to be processed as SMM is mutually exclusive with * non-root mode. INIT/SIPI don't need to be checked as INIT * is blocked post-VMXON, and SIPIs are ignored. */ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) kvm_make_request(KVM_REQ_EVENT, vcpu); return; } /* * After an early L2 VM-entry failure, we're now back * in L1 which thinks it just finished a VMLAUNCH or * VMRESUME instruction, so we need to set the failure * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction. */ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* * Restore L1's host state to KVM's software model. We're here * because a consistency check was caught by hardware, which * means some amount of guest state has been propagated to KVM's * model and needs to be unwound to the host's state. */ nested_vmx_restore_host_state(vcpu); vmx->fail = 0; } static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) { kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); } /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws * #UD, #GP, or #SS. 
*/ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret) { gva_t off; bool exn; struct kvm_segment s; /* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the * addressing components of the operand. Only the displacement part * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). * For how an actual address is calculated from all these components, * refer to Vol. 1, "Operand Addressing". */ int scaling = vmx_instruction_info & 3; int addr_size = (vmx_instruction_info >> 7) & 7; bool is_reg = vmx_instruction_info & (1u << 10); int seg_reg = (vmx_instruction_info >> 15) & 7; int index_reg = (vmx_instruction_info >> 18) & 0xf; bool index_is_valid = !(vmx_instruction_info & (1u << 22)); int base_reg = (vmx_instruction_info >> 23) & 0xf; bool base_is_valid = !(vmx_instruction_info & (1u << 27)); if (is_reg) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ if (addr_size == 1) off = (gva_t)sign_extend64(off, 31); else if (addr_size == 0) off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) off += kvm_register_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* * The effective address, i.e. @off, of a memory operand is truncated * based on the address size of the instruction. Note that this is * the *effective address*, i.e. the address prior to accounting for * the segment's base. */ if (addr_size == 1) /* 32 bit */ off &= 0xffffffff; else if (addr_size == 0) /* 16 bit */ off &= 0xffff; /* Checks for #GP/#SS exceptions. */ exn = false; if (is_long_mode(vcpu)) { /* * The virtual/linear address is never truncated in 64-bit * mode, e.g. a 32-bit address size can yield a 64-bit virtual * address when using FS/GS with a non-zero base. */ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) *ret = s.base + off; else *ret = off; *ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the * address size. */ *ret = (s.base + off) & 0xffffffff; /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) * - usability check (#GP(0)/#SS(0)) * - limit check (#GP(0)/#SS(0)) */ if (wr) /* #GP(0) if the destination operand is located in a * read-only data segment or any code segment. */ exn = ((s.type & 0xa) == 0 || (s.type & 8)); else /* #GP(0) if the source operand is located in an * execute-only code segment */ exn = ((s.type & 0xa) == 8); if (exn) { kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; } /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); /* * Protected mode: #GP(0)/#SS(0) if the memory operand is * outside the segment limit. All CPUs that support VMX ignore * limit checks for flat segments, i.e. segments with base==0, * limit==0xffffffff and of type expand-up data or code. 
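		 * The check below encodes exactly that: segment type bit 3
		 * set means a code segment, and for data segments a clear
		 * bit 2 means expand-up.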
*/ if (!(s.base == 0 && s.limit == 0xffffffff && ((s.type & 8) || !(s.type & 4)))) exn = exn || ((u64)off + len - 1 > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, seg_reg == VCPU_SREG_SS ? SS_VECTOR : GP_VECTOR, 0); return 1; } return 0; } static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { gva_t gva; struct x86_exception e; int r; if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) { *ret = 1; return -EINVAL; } r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } return 0; } /* * Allocate a shadow VMCS and associate it with the currently loaded * VMCS, unless such a shadow VMCS already exists. The newly allocated * VMCS is also VMCLEARed, so that it is ready for use. */ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; /* * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it * when L1 executes VMXOFF or the vCPU is forced out of nested * operation. VMXON faults if the CPU is already post-VMXON, so it * should be impossible to already have an allocated shadow VMCS. KVM * doesn't support virtualization of VMCS shadowing, so vmcs01 should * always be the loaded VMCS. */ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) return loaded_vmcs->shadow_vmcs; loaded_vmcs->shadow_vmcs = alloc_vmcs(true); if (loaded_vmcs->shadow_vmcs) vmcs_clear(loaded_vmcs->shadow_vmcs); return loaded_vmcs->shadow_vmcs; } static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); if (r < 0) goto out_vmcs02; vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; pt_update_intercept_for_msr(vcpu); } return 0; out_shadow_vmcs: kfree(vmx->nested.cached_shadow_vmcs12); out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: free_loaded_vmcs(&vmx->nested.vmcs02); out_vmcs02: return -ENOMEM; } /* Emulate the VMXON instruction. */ static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; uint32_t revision; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter * the guest and so cannot rely on hardware to perform the check, * which has higher priority than VM-Exit (see Intel SDM's pseudocode * for VMXON). * * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't * force any of the relevant guest state. 
For a restricted guest, KVM * does force CR0.PE=1, but only to also force VM86 in order to emulate * Real Mode, and so there's no need to check CR0.PE manually. */ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* * The CPL is checked for "not in VMX operation" and for "in VMX root", * and has higher priority than the VM-Fail due to being post-VMXON, * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits * from L2 to L1, i.e. there's no need to check for the vCPU being in * VMX non-root. * * Forwarding the VM-Exit unconditionally, i.e. without performing the * #UD checks (see above), is functionally ok because KVM doesn't allow * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; } if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); /* * Invalid CR0/CR4 generates #GP. These checks are performed if and * only if the vCPU isn't already in VMX operation, i.e. effectively * have lower priority than the VM-Fail above. */ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; } if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) return ret; /* * SDM 3: 24.11.5 * The first 4 bytes of VMXON region contain the supported * VMCS revision identifier * * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu); if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || revision != VMCS12_REVISION) return nested_vmx_failInvalid(vcpu); vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) return ret; return nested_vmx_succeed(vcpu); } static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ kvm_vcpu_write_guest_page(vcpu, vmx->nested.current_vmptr >> PAGE_SHIFT, vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(vcpu); if (kvm_apic_has_pending_init_or_sipi(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); return nested_vmx_succeed(vcpu); } /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER); if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); /* * Silently ignore memory errors on VMCLEAR, Intel's pseudocode * for VMCLEAR includes an "ensure that data for VMCS referenced * by the operand is in memory" clause that guards writes to * memory, i.e. doing nothing for I/O is architecturally valid. * * FIXME: Suppress failures if and only if no memslot is found, * i.e. exit to userspace if __copy_to_user() fails. */ (void)kvm_vcpu_write_guest(vcpu, vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); } return nested_vmx_succeed(vcpu); } /* Emulate the VMLAUNCH instruction */ static int handle_vmlaunch(struct kvm_vcpu *vcpu) { return nested_vmx_run(vcpu, true); } /* Emulate the VMRESUME instruction */ static int handle_vmresume(struct kvm_vcpu *vcpu) { return nested_vmx_run(vcpu, false); } static int handle_vmread(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; u64 value; gva_t gva = 0; short offset; int len, r; if (!nested_vmx_check_permission(vcpu)) return 1; /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* Read the field, zero-extended to a u64 value */ value = vmcs12_read_any(vmcs12, field, offset); } else { /* * Hyper-V TLFS (as of 6.0b) explicitly states that while an * enlightened VMCS is active, VMREAD/VMWRITE instructions are * unsupported. Unfortunately, certain versions of Windows 11 * don't comply with this requirement, which is not enforced in * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a * workaround, as misbehaving guests will panic on VM-Fail. * Note, enlightened VMCS is incompatible with shadow VMCS so * all VMREADs from L2 should go to L1. */ if (WARN_ON_ONCE(is_guest_mode(vcpu))) return nested_vmx_failInvalid(vcpu); offset = evmcs_field_offset(field, NULL); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* Read the field, zero-extended to a u64 value */ value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); } /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (instr_info & BIT(10)) { kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ?
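/* memory operand length: 8 bytes in 64-bit mode, 4 bytes otherwise */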
8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); } static bool is_shadow_field_rw(unsigned long field) { switch (field) { #define SHADOW_FIELD_RW(x, y) case x: #include "vmcs_shadow_fields.h" return true; default: break; } return false; } static bool is_shadow_field_ro(unsigned long field) { switch (field) { #define SHADOW_FIELD_RO(x, y) case x: #include "vmcs_shadow_fields.h" return true; default: break; } return false; } static int handle_vmwrite(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; short offset; gva_t gva; int len, r; /* * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field. */ u64 value = 0; if (!nested_vmx_check_permission(vcpu)) return 1; /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. */ if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, instr_info, false, len, &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write. */ if (vmcs_field_readonly(field) && !nested_cpu_has_vmwrite_any_field(vcpu)) return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); /* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties * vmcs12, else we may crush a field or consume a stale value. */ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM * behavior regardless of the underlying hardware, e.g. if an AR_BYTE * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD * from L1 will return a different value than VMREAD from L2 (L1 sees * the stripped down value, L2 sees the full value as stored by KVM). */ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) value &= 0x1f0ff; vmcs12_write_any(vmcs12, field, offset, value); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated * by L1 without a vmexit are always updated in the vmcs02, i.e. 
don't * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. */ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { /* * L1 can read these fields without exiting, ensure the * shadow VMCS is up-to-date. */ if (enable_shadow_vmcs && is_shadow_field_ro(field)) { preempt_disable(); vmcs_load(vmx->vmcs01.shadow_vmcs); __vmcs_writel(field, value); vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); preempt_enable(); } vmx->nested.dirty_vmcs12 = true; } return nested_vmx_succeed(vcpu); } static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) { vmx->nested.current_vmptr = vmptr; if (enable_shadow_vmcs) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; vmx->nested.force_msr_bitmap_recalc = true; } /* Emulate the VMPTRLD instruction */ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ if (nested_vmx_is_evmptr12_valid(vmx)) return 1; if (vmx->nested.current_vmptr != vmptr) { struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; struct vmcs_hdr hdr; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the * given physical address won't match the required * VMCS12_REVISION identifier. */ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, offsetof(struct vmcs12, hdr), sizeof(hdr))) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } if (hdr.revision_id != VMCS12_REVISION || (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } nested_release_vmcs12(vcpu); /* * Load VMCS12 from guest memory since it is not already * cached. 
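 * (The read goes through the gfn_to_hva cache initialized above and
 * pulls in the full VMCS12_SIZE allocation, mirroring the flush back
 * to guest memory in nested_release_vmcs12().)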
*/ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, VMCS12_SIZE)) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } set_current_vmptr(vmx, vmptr); } return nested_vmx_succeed(vcpu); } /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; gva_t gva; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type, roots_to_free; struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!nested_vmx_check_permission(vcpu)) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, * not root_mmu. 
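 * (root_mmu, by contrast, covers L1's own translations; see the
 * INVVPID handler below, which frees guest-mode roots of root_mmu
 * because L1 and L2 share an MMU when EPT is disabled.)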
*/ mmu = &vcpu->arch.guest_mmu; switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (nested_ept_root_matches(mmu->prev_roots[i].hpa, mmu->prev_roots[i].pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } break; case VMX_EPT_EXTENT_GLOBAL: roots_to_free = KVM_MMU_ROOTS_ALL; break; default: BUG(); break; } if (roots_to_free) kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); return nested_vmx_succeed(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info; unsigned long type, types; gva_t gva; struct x86_exception e; struct { u64 vpid; u64 gla; } operand; u16 vpid02; int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!nested_vmx_check_permission(vcpu)) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* * Always flush the effective vpid02, i.e. never flush the current VPID * and never explicitly flush vpid01. INVVPID targets a VPID, not a * VMCS, and so whether or not the current vmcs12 has VPID enabled is * irrelevant (and there may not be a loaded vmcs12). */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: /* * LAM doesn't apply to addresses that are inputs to TLB * invalidation. */ if (!operand.vpid || is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT: vpid_sync_context(vpid02); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } /* * Sync the shadow page tables if EPT is disabled, as L1 is invalidating * linear mappings for L2 (tagged with L2's VPID). Free all guest * roots as VPIDs are not tracked in the MMU role. * * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share * an MMU when EPT is disabled. * * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
*/ if (!enable_ept) kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); return nested_vmx_succeed(vcpu); } static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 index = kvm_rcx_read(vcpu); u64 new_eptp; if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) return 1; if (index >= VMFUNC_EPTP_ENTRIES) return 1; if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, &new_eptp, index * 8, 8)) return 1; /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else */ if (vmcs12->ept_pointer != new_eptp) { if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; vmcs12->ept_pointer = new_eptp; nested_ept_new_eptp(vcpu); if (!nested_cpu_has_vpid(vmcs12)) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } return 0; } static int handle_vmfunc(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; u32 function = kvm_rax_read(vcpu); /* * VMFUNC should never execute cleanly while L1 is active; KVM supports * VMFUNC for nested VMs, but not for L1. */ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmcs12 = get_vmcs12(vcpu); /* * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC * is enabled in vmcs02 if and only if it's enabled in vmcs12. */ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail; switch (function) { case 0: if (nested_vmx_eptp_switching(vcpu, vmcs12)) goto fail; break; default: goto fail; } return kvm_skip_emulated_instruction(vcpu); fail: /* * This is effectively a reflected VM-Exit, as opposed to a synthesized * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. */ nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; } /* * Return true if an IO instruction with the specified port and size should cause * a VM-exit into L1. */ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gpa_t bitmap, last_bitmap; u8 b; last_bitmap = INVALID_GPA; b = -1; while (size > 0) { if (port < 0x8000) bitmap = vmcs12->io_bitmap_a; else if (port < 0x10000) bitmap = vmcs12->io_bitmap_b; else return true; bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; port++; size--; last_bitmap = bitmap; } return false; } static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { unsigned long exit_qualification; unsigned short port; int size; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; return nested_vmx_check_io_bitmaps(vcpu, port, size); } /* * Return 1 if we should exit from L2 to L1 to handle an MSR access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed * disinterest in the current event (read or write a specific MSR) by using an * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 
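 *
 * Worked example (illustrative): a WRMSR to MSR_EFER (0xc0000080) is
 * looked up in the write-high bitmap, i.e. at vmcs12->msr_bitmap
 * + 2048 (write) + 1024 (high MSR range), byte 0x80 / 8 = 16, bit 0;
 * the exit is reflected to L1 iff that bit is set.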
*/ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) msr_index = vmx_get_exit_qual(vcpu); else msr_index = kvm_rcx_read(vcpu); /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; if (exit_reason.basic == EXIT_REASON_MSR_WRITE || exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; bitmap += 1024; } /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else return true; /* let L1 handle the wrong parameter */ } /* * Return 1 if we should exit from L2 to L1 to handle a CR access exit, * rather than handle it ourselves in L0. I.e., check if L1 wanted to * intercept (via guest_host_mask etc.) the current event. */ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ reg = (exit_qualification >> 8) & 15; val = kvm_register_read(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & (val ^ vmcs12->cr0_read_shadow)) return true; break; case 3: if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) return true; break; case 4: if (vmcs12->cr4_guest_host_mask & (vmcs12->cr4_read_shadow ^ val)) return true; break; case 8: if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) return true; break; } break; case 2: /* clts */ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && (vmcs12->cr0_read_shadow & X86_CR0_TS)) return true; break; case 1: /* mov from cr */ switch (cr) { case 3: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR3_STORE_EXITING) return true; break; case 8: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR8_STORE_EXITING) return true; break; } break; case 3: /* lmsw */ /* * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit. 
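 *
 * Illustrative case: with cr0_guest_host_mask = 0x9 (L1 owns PE and TS)
 * and a read shadow of PE=1, TS=0, "lmsw 0x8" exits to L1 (TS differs
 * under the 0xe mask below), while "lmsw 0x0" does not, as lmsw cannot
 * clear PE.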
*/ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) return true; if ((vmcs12->cr0_guest_host_mask & 0x1) && !(vmcs12->cr0_read_shadow & 0x1) && (val & 0x1)) return true; break; } return false; } static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 encls_leaf; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; encls_leaf = kvm_rax_read(vcpu); if (encls_leaf > 62) encls_leaf = 63; return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); } static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { u32 vmx_instruction_info; unsigned long field; u8 b; if (!nested_cpu_has_shadow_vmcs(vmcs12)) return true; /* Decode instruction info and find the field to access */ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Out-of-range fields always cause a VM exit from L2 to L1 */ if (field >> 15) return true; if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) return true; return 1 & (b >> (field & 7)); } static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) { u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; if (nested_cpu_has_mtf(vmcs12)) return true; /* * An MTF VM-exit may be injected into the guest by setting the * interruption-type to 7 (other event) and the vector field to 0. Such * is the case regardless of the 'monitor trap flag' VM-execution * control. */ return entry_intr_info == (INTR_INFO_VALID_MASK | INTR_TYPE_OTHER_EVENT); } /* * Return true if L0 wants to handle an exit from L2 regardless of whether or not * L1 wants the exit. Only call this when in is_guest_mode (L2). */ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason) { u32 intr_info; switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) return vcpu->arch.apf.host_apf_flags || vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) return true; else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return true; else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; else if (is_ve_fault(intr_info)) return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; case EXIT_REASON_MCE_DURING_VMENTRY: return true; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is * used, and the nested mmu code discovers that the address is * missing in the guest EPT table (EPT12), the EPT violation * will be injected with nested_ept_inject_page_fault() */ return true; case EXIT_REASON_EPT_MISCONFIG: /* * L2 never uses directly L1's EPT, but rather L0's own EPT * table (shadow on EPT) or a merged EPT table that L0 built * (EPT on EPT). So any problems with the structure of the * table is L0's fault. */ return true; case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: /* * PML is emulated for an L1 VMM and should never be enabled in * vmcs02, always "handle" PML_FULL by exiting to userspace. */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. 
*/ return true; case EXIT_REASON_BUS_LOCK: /* * At present, bus lock VM exit is never exposed to L1. * Handle L2's bus locks in L0 directly. */ return true; #ifdef CONFIG_KVM_HYPERV case EXIT_REASON_VMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && nested_evmcs_l2_tlb_flush_enabled(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu); #endif default: break; } return false; } /* * Return 1 if L1 wants to intercept an exit from L2. Only call this when in * is_guest_mode (L2). */ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 intr_info; switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) return true; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: return true; case EXIT_REASON_INTERRUPT_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); case EXIT_REASON_INVD: return true; case EXIT_REASON_INVLPG: return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMREAD: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, vmcs12->vmread_bitmap); case EXIT_REASON_VMWRITE: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! 
*/ return true; case EXIT_REASON_CR_ACCESS: return nested_vmx_exit_handled_cr(vcpu, vmcs12); case EXIT_REASON_DR_ACCESS: return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: case EXIT_REASON_MSR_READ_IMM: case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: return nested_vmx_exit_handled_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: return true; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: case EXIT_REASON_APIC_WRITE: case EXIT_REASON_EOI_INDUCED: /* * The controls for "virtualize APIC accesses," "APIC- * register virtualization," and "virtual-interrupt * delivery" only come from vmcs12. */ return true; case EXIT_REASON_INVPCID: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: /* * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap * verbatim, i.e. any exit is due to L1's bitmap. WARN if * XSAVES isn't enabled, as the CPU is supposed to inject #UD * in that case, before consulting the XSS-bitmap. */ WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; case EXIT_REASON_SEAMCALL: case EXIT_REASON_TDCALL: /* * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't * virtualized by KVM for L1 hypervisors, i.e. L1 should * never want or expect such an exit. */ return false; default: return true; } } /* * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was * reflected into L1. */ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; WARN_ON_ONCE(vmx->nested.nested_run_pending); /* * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM * has already loaded L2's state. */ if (unlikely(vmx->fail)) { trace_kvm_nested_vmenter_failed( "hardware VM-instruction error: ", vmcs_read32(VM_INSTRUCTION_ERROR)); exit_intr_info = 0; exit_qual = 0; goto reflect_vmexit; } trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) return false; /* If L1 doesn't want the exit, handle it in L0. 
*/ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) return false; /* * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ exit_intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); return true; } static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) { struct vcpu_vmx *vmx; struct vmcs12 *vmcs12; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, .hdr.vmx.vmxon_pa = INVALID_GPA, .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; if (!vcpu) return kvm_state.size + sizeof(*user_vmx_nested_state); vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ if (nested_vmx_is_evmptr12_set(vmx)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } if (vmx->nested.smm.vmxon) kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; if (vmx->nested.smm.guest_mode) kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; if (is_guest_mode(vcpu)) { kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; if (vmx->nested.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; if (vmx->nested.mtf_pending) kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; if (nested_cpu_has_preemption_timer(vmcs12) && vmx->nested.has_preemption_timer_deadline) { kvm_state.hdr.vmx.flags |= KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; kvm_state.hdr.vmx.preemption_timer_deadline = vmx->nested.preemption_timer_deadline; } } } if (user_data_size < kvm_state.size) goto out; if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) return -EFAULT; if (!vmx_has_valid_vmcs12(vcpu)) goto out; /* * When running L2, the authoritative vmcs12 state is in the * vmcs02. When running L1, the authoritative vmcs12 state is * in the shadow or enlightened vmcs linked to vmcs01, unless * need_vmcs12_to_shadow_sync is set, in which case, the authoritative * vmcs12 state is in the vmcs12 already. 
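 *
 * In short: L2 active -> sync from vmcs02; L1 active with a valid
 * eVMCS -> full copy from the enlightened VMCS (its clean-fields
 * hints only hold at VM-Entry); otherwise -> copy from the shadow
 * VMCS when shadowing is enabled.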
*/ if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while * not in guest mode, 'hv_clean_fields' is only * supposed to be actual upon vmentry so we need * to ignore it here and do full copy. */ copy_enlightened_to_vmcs12(vmx, 0); else if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); } } BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); /* * Copy over the full allocated size of vmcs12 rather than just the size * of the struct. */ if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } out: return kvm_state.size; } void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); } free_nested(vcpu); } static int vmx_set_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; enum vm_entry_failure_code ignored; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; int ret; if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* * KVM_STATE_NESTED_EVMCS used to signal that KVM should * enable eVMCS capability on vCPU. However, since then * code was changed such that flag signals vmcs12 should * be copied into eVMCS in guest memory. * * To preserve backwards compatibility, allow user * to set this flag even when there is no VMXON region. */ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) return -EINVAL; } if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return -EINVAL; if (kvm_state->hdr.vmx.smm.flags & ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) return -EINVAL; /* * SMM temporarily disables VMX, so we cannot be in guest mode, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags * must be zero. */ if (is_smm(vcpu) ? 
(kvm_state->flags & (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) : kvm_state->hdr.vmx.smm.flags) return -EINVAL; if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) return ret; /* Empty 'VMXON' state is permitted if no VMCS loaded */ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); #ifdef CONFIG_KVM_HYPERV } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); #endif } else { return -EINVAL; } if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { vmx->nested.smm.vmxon = true; vmx->nested.vmxon = false; if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) vmx->nested.smm.guest_mode = true; } vmcs12 = get_vmcs12(vcpu); if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) return -EFAULT; if (vmcs12->hdr.revision_id != VMCS12_REVISION) return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); vmx->nested.mtf_pending = !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < sizeof(*kvm_state) + sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) goto error_guest_mode; if (copy_from_user(shadow_vmcs12, user_vmx_nested_state->shadow_vmcs12, sizeof(*shadow_vmcs12))) { ret = -EFAULT; goto error_guest_mode; } if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || !shadow_vmcs12->hdr.shadow_vmcs) goto error_guest_mode; } vmx->nested.has_preemption_timer_deadline = false; if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { vmx->nested.has_preemption_timer_deadline = true; vmx->nested.preemption_timer_deadline = kvm_state->hdr.vmx.preemption_timer_deadline; } if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; vmx->nested.force_msr_bitmap_recalc = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) goto error_guest_mode; if (vmx->nested.mtf_pending) kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; error_guest_mode: vmx->nested.nested_run_pending = 0; return ret; } void 
nested_vmx_set_vmcs_shadowing_bitmap(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } } /* * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo * that madness to get the encoding for comparison. */ #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) static u64 nested_vmx_calc_vmcs_enum_msr(void) { /* * Note these are the so called "index" of the VMCS field encoding, not * the index into vmcs12. */ unsigned int max_idx, idx; int i; /* * For better or worse, KVM allows VMREAD/VMWRITE to all fields in * vmcs12, regardless of whether or not the associated feature is * exposed to L1. Simply find the field with the highest index. */ max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ if (!vmcs12_field_offsets[i]) continue; idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); if (idx > max_idx) max_idx = idx; } return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; } static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; msrs->pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | (enable_apicv ? PIN_BASED_POSTED_INTR : 0); msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; } static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; msrs->exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && !kvm_cpu_cap_has(X86_FEATURE_IBT)) msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; } static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; msrs->entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && !kvm_cpu_cap_has(X86_FEATURE_IBT)) msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; /* We support free control of debug control loading. 
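 * Clearing the bit in the low ("must be one") half advertises that L1
 * may leave VM_ENTRY_LOAD_DEBUG_CONTROLS off, mirroring the handling
 * of VM_EXIT_SAVE_DEBUG_CONTROLS in the exit controls above.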
*/ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; } static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ msrs->procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); } static void nested_vmx_setup_secondary_ctls(u32 ept_caps, struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. */ msrs->secondary_ctls_high |= SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPT_PAGE_WALK_5_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT | VMX_EPT_EXECUTE_ONLY_BIT; msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; if (enable_ept_ad_bits) { msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } /* * Advertise EPTP switching irrespective of hardware support, * KVM emulates it in software so long as VMFUNC is supported. */ if (cpu_has_vmx_vmfunc()) msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* * Old versions of KVM use the single-context version without * checking for support, so declare that it is supported even * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. 
*/ if (enable_vpid) { msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VPID; msrs->vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; } if (enable_unrestricted_guest) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; if (flexpriority_enabled) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (enable_sgx) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; } static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; } static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) { /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, X86_MEMTYPE_WB); msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; if (cpu_has_vmx_basic_no_hw_errcode_cc()) msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; } static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) { /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). * We picked the standard core2 setting. */ #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) #define VMXON_CR4_ALWAYSON X86_CR4_VMXE msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; } /* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. * The same values should also be used to verify that vmcs12 control fields are * valid during nested entry from L1 to L2. * Each of these control msrs has a low and high 32-bit half: A low bit is on * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) { struct nested_vmx_msrs *msrs = &vmcs_conf->nested; /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the * intersection of the underlying hardware's MSR (i.e., features which * can be supported) and the list of features we want to expose - * because they are known to be properly supported in our code. * Also, usually, the low half of the MSRs (bits which must be 1) can * be set to 0, meaning that L1 may turn off any of these bits. The * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below. 
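 *
 * Concretely (illustrative numbers): with pinbased_ctls_low = 0x16 and
 * pinbased_ctls_high = 0xff, a vmcs12 pin-based control value X is
 * accepted iff (X & 0x16) == 0x16 and (X & ~0xff) == 0, i.e. every
 * "must be one" bit is set and no unsupported bit is.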
*/ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); nested_vmx_setup_exit_ctls(vmcs_conf, msrs); nested_vmx_setup_entry_ctls(vmcs_conf, msrs); nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); nested_vmx_setup_misc_data(vmcs_conf, msrs); nested_vmx_setup_basic(msrs); nested_vmx_setup_cr_fixed(msrs); msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) { int i; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) free_page((unsigned long)vmx_bitmap[i]); } } __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { /* * The vmx_bitmap is not tied to a VM and so should * not be charged to a memcg. */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { nested_vmx_hardware_unsetup(); return -ENOMEM; } } init_vmcs_shadow_fields(); } exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; return 0; } struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, .has_events = vmx_has_nested_events, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, #ifdef CONFIG_KVM_HYPERV .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, #endif };
// SPDX-License-Identifier: GPL-2.0 /* * Kernel internal schedule timeout and sleeping functions */ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include "tick-internal.h" /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns: 0 when the timer has expired, otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative.
*/ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * This special case is useful to make the caller comfortable. * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of * the negative values, but I'd like to return a valid offset * (>=0) to allow the caller to do everything it wants with the * retval. */ schedule(); goto out; default: /* * Another bit of paranoia. Note that the retval will be * 0 since no piece of the kernel is supposed to check * for a negative retval of schedule_timeout() (since it * should never happen anyway). You just have the printk() * that will tell you if something has gone wrong, and where. */ if (timeout < 0) { pr_err("%s: wrong timeout value %lx\n", __func__, timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); timer.timer.expires = expire; add_timer(&timer.timer); schedule(); timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * __set_current_state() can be used in schedule_timeout_*() functions, because * schedule_timeout() calls schedule() unconditionally. */ /** * schedule_timeout_interruptible - sleep until timeout (interruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); /** * schedule_timeout_killable - sleep until timeout (killable) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_KILLABLE before starting the timeout. */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); /** * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /** * schedule_timeout_idle - sleep until timeout (idle) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_IDLE before starting the timeout. It is similar to * schedule_timeout_uninterruptible(), except this task will not contribute to * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used * * Details are explained in the schedule_hrtimeout_range() function * description, as that is the commonly used variant.
*/ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel gives the normal best effort behavior for "@expires+@delta", * and may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least the time specified by @expires is * guaranteed to pass before the routine returns unless the current task * is explicitly woken up (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Returns: 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * See schedule_hrtimeout_range() for details. The @delta argument of * schedule_hrtimeout_range() is set to 0 and therefore has no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Requested sleep duration in milliseconds * * msleep() uses jiffy based timeouts for the sleep duration. Because of the * design of the timer wheel, the maximum additional percentage delay (slack) is * 12.5%. This is only valid for timers which will end up in level 1 or a higher * level of the timer wheel. For an explanation of those 12.5% please check the * detailed description about the basics of the timer wheel.
* * The slack of timers which will end up in level 0 depends on the sleep * duration (msecs) and the HZ configuration and can be calculated in the * following way (with the timer wheel design restriction that the slack is * not less than 12.5%): * * ``slack = MSECS_PER_TICK / msecs`` * * When the allowed slack of the callsite is known, the calculation can be * turned around to find the minimal sleep duration that meets the * constraints. For example: * * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: * all sleep durations greater than or equal to 4ms will meet the constraints. * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: * all sleep durations greater than or equal to 8ms will meet the constraints. * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: * all sleep durations greater than or equal to 16ms will meet the constraints. * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: * all sleep durations greater than or equal to 32ms will meet the constraints. * * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Requested sleep duration in milliseconds * * See msleep() for some basic information. * * The difference between msleep() and msleep_interruptible() is that the * latter can be interrupted by signal delivery, in which case it returns * early. * * Returns: The remaining time of the sleep duration transformed to msecs (see * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State the current task will be in while sleeping * * usleep_range_state() sleeps at least for the minimum specified time but not * longer than the maximum specified amount of time. The range might reduce * power usage by allowing hrtimers to coalesce an already scheduled interrupt * with this hrtimer. In the worst case, an interrupt is scheduled for the upper * bound. * * The sleeping task is set to the specified state before starting the sleep. * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() or its variants instead of udelay(). The sleep improves * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; if (WARN_ON_ONCE(max < min)) delta = 0; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state);
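Taken together, the functions above form a spectrum from coarse, jiffy-based sleeps to precise hrtimer-based ones. A hypothetical usage sketch, assuming kernel context (the function name is illustrative):

static void example_sleeps(void)
{
	long remaining;

	/* ~20ms sleep where timer-wheel slack (see msleep() above) is fine */
	msleep(20);

	/* 100-200us window: the range lets hrtimers coalesce wakeups */
	usleep_range(100, 200);

	/* interruptible sleep for up to 50ms, detecting early wakeup */
	remaining = schedule_timeout_interruptible(msecs_to_jiffies(50));
	if (remaining)
		pr_debug("woken up %ld jiffies early\n", remaining);
}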
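For the hrtimer path, here is a sketch of an absolute-deadline sleep with explicit slack, per the schedule_hrtimeout_range() description above (hypothetical, kernel context assumed):

/*
 * Sleep until @deadline, allowing 50us of slack for wakeup coalescing.
 * Returns 0 if the timer expired, -EINTR on a signal or explicit wakeup.
 */
static int example_deadline_sleep(ktime_t deadline)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_hrtimeout_range(&deadline, 50 * NSEC_PER_USEC,
					HRTIMER_MODE_ABS);
}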
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. */ #include <linux/list_sort.h> #include <linux/libnvdimm.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/mutex.h> #include <linux/ndctl.h> #include <linux/sysfs.h> #include <linux/delay.h> #include <linux/list.h> #include <linux/acpi.h> #include <linux/sort.h> #include <linux/io.h> #include <linux/nd.h> #include <asm/cacheflush.h> #include <acpi/nfit.h> #include "intel.h" #include "nfit.h" /* * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is * irrelevant.
*/ #include <linux/io-64-nonatomic-hi-lo.h> static bool force_enable_dimms; module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); static bool disable_vendor_specific; module_param(disable_vendor_specific, bool, S_IRUGO); MODULE_PARM_DESC(disable_vendor_specific, "Limit commands to the publicly specified set"); static unsigned long override_dsm_mask; module_param(override_dsm_mask, ulong, S_IRUGO); MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions"); static int default_dsm_family = -1; module_param(default_dsm_family, int, S_IRUGO); MODULE_PARM_DESC(default_dsm_family, "Try this DSM type first when identifying NVDIMM family"); static bool no_init_ars; module_param(no_init_ars, bool, 0644); MODULE_PARM_DESC(no_init_ars, "Skip ARS run at nfit init time"); static bool force_labels; module_param(force_labels, bool, 0444); MODULE_PARM_DESC(force_labels, "Opt-in to labels despite missing methods"); LIST_HEAD(acpi_descs); DEFINE_MUTEX(acpi_desc_lock); static struct workqueue_struct *nfit_wq; struct nfit_table_prev { struct list_head spas; struct list_head memdevs; struct list_head dcrs; struct list_head bdws; struct list_head idts; struct list_head flushes; }; static guid_t nfit_uuid[NFIT_UUID_MAX]; const guid_t *to_nfit_uuid(enum nfit_uuids id) { return &nfit_uuid[id]; } EXPORT_SYMBOL(to_nfit_uuid); static const guid_t *to_nfit_bus_uuid(int family) { if (WARN_ONCE(family == NVDIMM_BUS_FAMILY_NFIT, "only secondary bus families can be translated\n")) return NULL; /* * The index of bus UUIDs starts immediately following the last * NVDIMM/leaf family. */ return to_nfit_uuid(family + NVDIMM_FAMILY_MAX); } static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; /* * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct * acpi_device. */ if (!nd_desc->provider_name || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0) return NULL; return to_acpi_device(acpi_desc->dev); } static int xlat_bus_status(void *buf, unsigned int cmd, u32 status) { struct nd_cmd_clear_error *clear_err; struct nd_cmd_ars_status *ars_status; u16 flags; switch (cmd) { case ND_CMD_ARS_CAP: if ((status & 0xffff) == NFIT_ARS_CAP_NONE) return -ENOTTY; /* Command failed */ if (status & 0xffff) return -EIO; /* No supported scan types for this range */ flags = ND_ARS_PERSISTENT | ND_ARS_VOLATILE; if ((status >> 16 & flags) == 0) return -ENOTTY; return 0; case ND_CMD_ARS_START: /* ARS is in progress */ if ((status & 0xffff) == NFIT_ARS_START_BUSY) return -EBUSY; /* Command failed */ if (status & 0xffff) return -EIO; return 0; case ND_CMD_ARS_STATUS: ars_status = buf; /* Command failed */ if (status & 0xffff) return -EIO; /* Check extended status (Upper two bytes) */ if (status == NFIT_ARS_STATUS_DONE) return 0; /* ARS is in progress */ if (status == NFIT_ARS_STATUS_BUSY) return -EBUSY; /* No ARS performed for the current boot */ if (status == NFIT_ARS_STATUS_NONE) return -EAGAIN; /* * ARS interrupted, either we overflowed or some other * agent wants the scan to stop. If we didn't overflow * then just continue with the returned results. 
*/ if (status == NFIT_ARS_STATUS_INTR) { if (ars_status->out_length >= 40 && (ars_status->flags & NFIT_ARS_F_OVERFLOW)) return -ENOSPC; return 0; } /* Unknown status */ if (status >> 16) return -EIO; return 0; case ND_CMD_CLEAR_ERROR: clear_err = buf; if (status & 0xffff) return -EIO; if (!clear_err->cleared) return -EIO; if (clear_err->length > clear_err->cleared) return clear_err->cleared; return 0; default: break; } /* all other non-zero status results in an error */ if (status) return -EIO; return 0; } #define ACPI_LABELS_LOCKED 3 static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, u32 status) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); switch (cmd) { case ND_CMD_GET_CONFIG_SIZE: /* * In the _LSI, _LSR, _LSW case the locked status is * communicated via the read/write commands */ if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) break; if (status >> 16 & ND_CONFIG_LOCKED) return -EACCES; break; case ND_CMD_GET_CONFIG_DATA: if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags) && status == ACPI_LABELS_LOCKED) return -EACCES; break; case ND_CMD_SET_CONFIG_DATA: if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags) && status == ACPI_LABELS_LOCKED) return -EACCES; break; default: break; } /* all other non-zero status results in an error */ if (status) return -EIO; return 0; } static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, u32 status) { if (!nvdimm) return xlat_bus_status(buf, cmd, status); return xlat_nvdimm_status(nvdimm, buf, cmd, status); } /* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */ static union acpi_object *pkg_to_buf(union acpi_object *pkg) { int i; void *dst; size_t size = 0; union acpi_object *buf = NULL; if (pkg->type != ACPI_TYPE_PACKAGE) { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", pkg->type); goto err; } for (i = 0; i < pkg->package.count; i++) { union acpi_object *obj = &pkg->package.elements[i]; if (obj->type == ACPI_TYPE_INTEGER) size += 4; else if (obj->type == ACPI_TYPE_BUFFER) size += obj->buffer.length; else { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", obj->type); goto err; } } buf = ACPI_ALLOCATE(sizeof(*buf) + size); if (!buf) goto err; dst = buf + 1; buf->type = ACPI_TYPE_BUFFER; buf->buffer.length = size; buf->buffer.pointer = dst; for (i = 0; i < pkg->package.count; i++) { union acpi_object *obj = &pkg->package.elements[i]; if (obj->type == ACPI_TYPE_INTEGER) { memcpy(dst, &obj->integer.value, 4); dst += 4; } else if (obj->type == ACPI_TYPE_BUFFER) { memcpy(dst, obj->buffer.pointer, obj->buffer.length); dst += obj->buffer.length; } } err: ACPI_FREE(pkg); return buf; } static union acpi_object *int_to_buf(union acpi_object *integer) { union acpi_object *buf = NULL; void *dst = NULL; if (integer->type != ACPI_TYPE_INTEGER) { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", integer->type); goto err; } buf = ACPI_ALLOCATE(sizeof(*buf) + 4); if (!buf) goto err; dst = buf + 1; buf->type = ACPI_TYPE_BUFFER; buf->buffer.length = 4; buf->buffer.pointer = dst; memcpy(dst, &integer->integer.value, 4); err: ACPI_FREE(integer); return buf; } static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset, u32 len, void *data) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_object_list input = { .count = 3, .pointer = (union acpi_object []) { [0] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = offset, }, [1] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = len, }, [2] = { .buffer.type = ACPI_TYPE_BUFFER, 
.buffer.pointer = data, .buffer.length = len, }, }, }; rc = acpi_evaluate_object(handle, "_LSW", &input, &buf); if (ACPI_FAILURE(rc)) return NULL; return int_to_buf(buf.pointer); } static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset, u32 len) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_object_list input = { .count = 2, .pointer = (union acpi_object []) { [0] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = offset, }, [1] = { .integer.type = ACPI_TYPE_INTEGER, .integer.value = len, }, }, }; rc = acpi_evaluate_object(handle, "_LSR", &input, &buf); if (ACPI_FAILURE(rc)) return NULL; return pkg_to_buf(buf.pointer); } static union acpi_object *acpi_label_info(acpi_handle handle) { acpi_status rc; struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf); if (ACPI_FAILURE(rc)) return NULL; return pkg_to_buf(buf.pointer); } static u8 nfit_dsm_revid(unsigned family, unsigned func) { static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = { [NVDIMM_FAMILY_INTEL] = { [NVDIMM_INTEL_GET_MODES ... NVDIMM_INTEL_FW_ACTIVATE_ARM] = 2, }, }; u8 id; if (family > NVDIMM_FAMILY_MAX) return 0; if (func > NVDIMM_CMD_MAX) return 0; id = revid_table[family][func]; if (id == 0) return 1; /* default */ return id; } static bool payload_dumpable(struct nvdimm *nvdimm, unsigned int func) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem && nfit_mem->family == NVDIMM_FAMILY_INTEL && func >= NVDIMM_INTEL_GET_SECURITY_STATE && func <= NVDIMM_INTEL_MASTER_SECURE_ERASE) return IS_ENABLED(CONFIG_NFIT_SECURITY_DEBUG); return true; } static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, struct nd_cmd_pkg *call_pkg, int *family) { if (call_pkg) { int i; if (nfit_mem && nfit_mem->family != call_pkg->nd_family) return -ENOTTY; for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++) if (call_pkg->nd_reserved2[i]) return -EINVAL; *family = call_pkg->nd_family; return call_pkg->nd_command; } /* In the !call_pkg case, bus commands == bus functions */ if (!nfit_mem) return cmd; /* Linux ND commands == NVDIMM_FAMILY_INTEL function numbers */ if (nfit_mem->family == NVDIMM_FAMILY_INTEL) return cmd; /* * Force function number validation to fail since 0 is never * published as a valid function in dsm_mask. 
*/ return 0; } int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); union acpi_object in_obj, in_buf, *out_obj; const struct nd_cmd_desc *desc = NULL; struct device *dev = acpi_desc->dev; struct nd_cmd_pkg *call_pkg = NULL; const char *cmd_name, *dimm_name; unsigned long cmd_mask, dsm_mask; u32 offset, fw_status = 0; acpi_handle handle; const guid_t *guid; int func, rc, i; int family = 0; if (cmd_rc) *cmd_rc = -EINVAL; if (cmd == ND_CMD_CALL) { if (!buf || buf_len < sizeof(*call_pkg)) return -EINVAL; call_pkg = buf; } func = cmd_to_func(nfit_mem, cmd, call_pkg, &family); if (func < 0) return func; if (nvdimm) { struct acpi_device *adev = nfit_mem->adev; if (!adev) return -ENOTTY; dimm_name = nvdimm_name(nvdimm); cmd_name = nvdimm_cmd_name(cmd); cmd_mask = nvdimm_cmd_mask(nvdimm); dsm_mask = nfit_mem->dsm_mask; desc = nd_cmd_dimm_desc(cmd); guid = to_nfit_uuid(nfit_mem->family); handle = adev->handle; } else { struct acpi_device *adev = to_acpi_dev(acpi_desc); cmd_name = nvdimm_bus_cmd_name(cmd); cmd_mask = nd_desc->cmd_mask; if (cmd == ND_CMD_CALL && call_pkg->nd_family) { family = call_pkg->nd_family; if (call_pkg->nd_family > NVDIMM_BUS_FAMILY_MAX || !test_bit(family, &nd_desc->bus_family_mask)) return -EINVAL; family = array_index_nospec(family, NVDIMM_BUS_FAMILY_MAX + 1); dsm_mask = acpi_desc->family_dsm_mask[family]; guid = to_nfit_bus_uuid(family); } else { dsm_mask = acpi_desc->bus_dsm_mask; guid = to_nfit_uuid(NFIT_DEV_BUS); } desc = nd_cmd_bus_desc(cmd); handle = adev->handle; dimm_name = "bus"; } if (!desc || (cmd && (desc->out_num + desc->in_num == 0))) return -ENOTTY; /* * Check for a valid command. For ND_CMD_CALL, we also have to * make sure that the DSM function is supported. 
*/ if (cmd == ND_CMD_CALL && (func > NVDIMM_CMD_MAX || !test_bit(func, &dsm_mask))) return -ENOTTY; else if (!test_bit(cmd, &cmd_mask)) return -ENOTTY; in_obj.type = ACPI_TYPE_PACKAGE; in_obj.package.count = 1; in_obj.package.elements = &in_buf; in_buf.type = ACPI_TYPE_BUFFER; in_buf.buffer.pointer = buf; in_buf.buffer.length = 0; /* libnvdimm has already validated the input envelope */ for (i = 0; i < desc->in_num; i++) in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc, i, buf); if (call_pkg) { /* skip over package wrapper */ in_buf.buffer.pointer = (void *) &call_pkg->nd_payload; in_buf.buffer.length = call_pkg->nd_size_in; } dev_dbg(dev, "%s cmd: %d: family: %d func: %d input length: %d\n", dimm_name, cmd, family, func, in_buf.buffer.length); if (payload_dumpable(nvdimm, func)) print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, in_buf.buffer.pointer, min_t(u32, 256, in_buf.buffer.length), true); /* call the BIOS, prefer the named methods over _DSM if available */ if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) out_obj = acpi_label_info(handle); else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) { struct nd_cmd_get_config_data_hdr *p = buf; out_obj = acpi_label_read(handle, p->in_offset, p->in_length); } else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && test_bit(NFIT_MEM_LSW, &nfit_mem->flags)) { struct nd_cmd_set_config_hdr *p = buf; out_obj = acpi_label_write(handle, p->in_offset, p->in_length, p->in_buf); } else { u8 revid; if (nvdimm) revid = nfit_dsm_revid(nfit_mem->family, func); else revid = 1; out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj); } if (!out_obj) { dev_dbg(dev, "%s _DSM failed cmd: %s\n", dimm_name, cmd_name); return -EINVAL; } if (out_obj->type != ACPI_TYPE_BUFFER) { dev_dbg(dev, "%s unexpected output object type cmd: %s type: %d\n", dimm_name, cmd_name, out_obj->type); rc = -EINVAL; goto out; } dev_dbg(dev, "%s cmd: %s output length: %d\n", dimm_name, cmd_name, out_obj->buffer.length); print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4, out_obj->buffer.pointer, min_t(u32, 128, out_obj->buffer.length), true); if (call_pkg) { call_pkg->nd_fw_size = out_obj->buffer.length; memcpy(call_pkg->nd_payload + call_pkg->nd_size_in, out_obj->buffer.pointer, min(call_pkg->nd_fw_size, call_pkg->nd_size_out)); ACPI_FREE(out_obj); /* * Need to support FW function w/o known size in advance. * Caller can determine required size based upon nd_fw_size. * If we return an error (like elsewhere) then caller wouldn't * be able to rely upon data returned to make calculation. */ if (cmd_rc) *cmd_rc = 0; return 0; } for (i = 0, offset = 0; i < desc->out_num; i++) { u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, (u32 *) out_obj->buffer.pointer, out_obj->buffer.length - offset); if (offset + out_size > out_obj->buffer.length) { dev_dbg(dev, "%s output object underflow cmd: %s field: %d\n", dimm_name, cmd_name, i); break; } if (in_buf.buffer.length + offset + out_size > buf_len) { dev_dbg(dev, "%s output overrun cmd: %s field: %d\n", dimm_name, cmd_name, i); rc = -ENXIO; goto out; } memcpy(buf + in_buf.buffer.length + offset, out_obj->buffer.pointer + offset, out_size); offset += out_size; } /* * Set fw_status for all the commands with a known format to be * later interpreted by xlat_status(). 
*/ if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR) || (nvdimm && cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR))) fw_status = *(u32 *) out_obj->buffer.pointer; if (offset + in_buf.buffer.length < buf_len) { if (i >= 1) { /* * status valid, return the number of bytes left * unfilled in the output buffer */ rc = buf_len - offset - in_buf.buffer.length; if (cmd_rc) *cmd_rc = xlat_status(nvdimm, buf, cmd, fw_status); } else { dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n", __func__, dimm_name, cmd_name, buf_len, offset); rc = -ENXIO; } } else { rc = 0; if (cmd_rc) *cmd_rc = xlat_status(nvdimm, buf, cmd, fw_status); } out: ACPI_FREE(out_obj); return rc; } EXPORT_SYMBOL_GPL(acpi_nfit_ctl); static const char *spa_type_name(u16 type) { static const char *to_name[] = { [NFIT_SPA_VOLATILE] = "volatile", [NFIT_SPA_PM] = "pmem", [NFIT_SPA_DCR] = "dimm-control-region", [NFIT_SPA_BDW] = "block-data-window", [NFIT_SPA_VDISK] = "volatile-disk", [NFIT_SPA_VCD] = "volatile-cd", [NFIT_SPA_PDISK] = "persistent-disk", [NFIT_SPA_PCD] = "persistent-cd", }; if (type > NFIT_SPA_PCD) return "unknown"; return to_name[type]; } int nfit_spa_type(struct acpi_nfit_system_address *spa) { guid_t guid; int i; import_guid(&guid, spa->range_guid); for (i = 0; i < NFIT_UUID_MAX; i++) if (guid_equal(to_nfit_uuid(i), &guid)) return i; return -1; } static size_t sizeof_spa(struct acpi_nfit_system_address *spa) { if (spa->flags & ACPI_NFIT_LOCATION_COOKIE_VALID) return sizeof(*spa); return sizeof(*spa) - 8; } static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_system_address *spa) { struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; if (spa->header.length != sizeof_spa(spa)) return false; list_for_each_entry(nfit_spa, &prev->spas, list) { if (memcmp(nfit_spa->spa, spa, sizeof_spa(spa)) == 0) { list_move_tail(&nfit_spa->list, &acpi_desc->spas); return true; } } nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof_spa(spa), GFP_KERNEL); if (!nfit_spa) return false; INIT_LIST_HEAD(&nfit_spa->list); memcpy(nfit_spa->spa, spa, sizeof_spa(spa)); list_add_tail(&nfit_spa->list, &acpi_desc->spas); dev_dbg(dev, "spa index: %d type: %s\n", spa->range_index, spa_type_name(nfit_spa_type(spa))); return true; } static bool add_memdev(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_memory_map *memdev) { struct device *dev = acpi_desc->dev; struct nfit_memdev *nfit_memdev; if (memdev->header.length != sizeof(*memdev)) return false; list_for_each_entry(nfit_memdev, &prev->memdevs, list) if (memcmp(nfit_memdev->memdev, memdev, sizeof(*memdev)) == 0) { list_move_tail(&nfit_memdev->list, &acpi_desc->memdevs); return true; } nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev) + sizeof(*memdev), GFP_KERNEL); if (!nfit_memdev) return false; INIT_LIST_HEAD(&nfit_memdev->list); memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev)); list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); dev_dbg(dev, "memdev handle: %#x spa: %d dcr: %d flags: %#x\n", memdev->device_handle, memdev->range_index, memdev->region_index, memdev->flags); return true; } int nfit_get_smbios_id(u32 device_handle, u16 *flags) { struct acpi_nfit_memory_map *memdev; struct acpi_nfit_desc *acpi_desc; struct nfit_mem *nfit_mem; u16 physical_id; mutex_lock(&acpi_desc_lock); list_for_each_entry(acpi_desc, &acpi_descs, list) { mutex_lock(&acpi_desc->init_mutex); list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { memdev = 
__to_nfit_memdev(nfit_mem); if (memdev->device_handle == device_handle) { *flags = memdev->flags; physical_id = memdev->physical_id; mutex_unlock(&acpi_desc->init_mutex); mutex_unlock(&acpi_desc_lock); return physical_id; } } mutex_unlock(&acpi_desc->init_mutex); } mutex_unlock(&acpi_desc_lock); return -ENODEV; } EXPORT_SYMBOL_GPL(nfit_get_smbios_id); /* * An implementation may provide a truncated control region if no block windows * are defined. */ static size_t sizeof_dcr(struct acpi_nfit_control_region *dcr) { if (dcr->header.length < offsetof(struct acpi_nfit_control_region, window_size)) return 0; if (dcr->windows) return sizeof(*dcr); return offsetof(struct acpi_nfit_control_region, window_size); } static bool add_dcr(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_control_region *dcr) { struct device *dev = acpi_desc->dev; struct nfit_dcr *nfit_dcr; if (!sizeof_dcr(dcr)) return false; list_for_each_entry(nfit_dcr, &prev->dcrs, list) if (memcmp(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)) == 0) { list_move_tail(&nfit_dcr->list, &acpi_desc->dcrs); return true; } nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr) + sizeof(*dcr), GFP_KERNEL); if (!nfit_dcr) return false; INIT_LIST_HEAD(&nfit_dcr->list); memcpy(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)); list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs); dev_dbg(dev, "dcr index: %d windows: %d\n", dcr->region_index, dcr->windows); return true; } static bool add_bdw(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_data_region *bdw) { struct device *dev = acpi_desc->dev; struct nfit_bdw *nfit_bdw; if (bdw->header.length != sizeof(*bdw)) return false; list_for_each_entry(nfit_bdw, &prev->bdws, list) if (memcmp(nfit_bdw->bdw, bdw, sizeof(*bdw)) == 0) { list_move_tail(&nfit_bdw->list, &acpi_desc->bdws); return true; } nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw) + sizeof(*bdw), GFP_KERNEL); if (!nfit_bdw) return false; INIT_LIST_HEAD(&nfit_bdw->list); memcpy(nfit_bdw->bdw, bdw, sizeof(*bdw)); list_add_tail(&nfit_bdw->list, &acpi_desc->bdws); dev_dbg(dev, "bdw dcr: %d windows: %d\n", bdw->region_index, bdw->windows); return true; } static size_t sizeof_idt(struct acpi_nfit_interleave *idt) { if (idt->header.length < sizeof(*idt)) return 0; return sizeof(*idt) + sizeof(u32) * idt->line_count; } static bool add_idt(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_interleave *idt) { struct device *dev = acpi_desc->dev; struct nfit_idt *nfit_idt; if (!sizeof_idt(idt)) return false; list_for_each_entry(nfit_idt, &prev->idts, list) { if (sizeof_idt(nfit_idt->idt) != sizeof_idt(idt)) continue; if (memcmp(nfit_idt->idt, idt, sizeof_idt(idt)) == 0) { list_move_tail(&nfit_idt->list, &acpi_desc->idts); return true; } } nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt) + sizeof_idt(idt), GFP_KERNEL); if (!nfit_idt) return false; INIT_LIST_HEAD(&nfit_idt->list); memcpy(nfit_idt->idt, idt, sizeof_idt(idt)); list_add_tail(&nfit_idt->list, &acpi_desc->idts); dev_dbg(dev, "idt index: %d num_lines: %d\n", idt->interleave_index, idt->line_count); return true; } static size_t sizeof_flush(struct acpi_nfit_flush_address *flush) { if (flush->header.length < sizeof(*flush)) return 0; return struct_size(flush, hint_address, flush->hint_count); } static bool add_flush(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_flush_address *flush) { struct device *dev = acpi_desc->dev; struct nfit_flush *nfit_flush; if (!sizeof_flush(flush)) return false; 
list_for_each_entry(nfit_flush, &prev->flushes, list) { if (sizeof_flush(nfit_flush->flush) != sizeof_flush(flush)) continue; if (memcmp(nfit_flush->flush, flush, sizeof_flush(flush)) == 0) { list_move_tail(&nfit_flush->list, &acpi_desc->flushes); return true; } } nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush) + sizeof_flush(flush), GFP_KERNEL); if (!nfit_flush) return false; INIT_LIST_HEAD(&nfit_flush->list); memcpy(nfit_flush->flush, flush, sizeof_flush(flush)); list_add_tail(&nfit_flush->list, &acpi_desc->flushes); dev_dbg(dev, "nfit_flush handle: %d hint_count: %d\n", flush->device_handle, flush->hint_count); return true; } static bool add_platform_cap(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_capabilities *pcap) { struct device *dev = acpi_desc->dev; u32 mask; mask = (1 << (pcap->highest_capability + 1)) - 1; acpi_desc->platform_cap = pcap->capabilities & mask; dev_dbg(dev, "cap: %#x\n", acpi_desc->platform_cap); return true; } static void *add_table(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, void *table, const void *end) { struct device *dev = acpi_desc->dev; struct acpi_nfit_header *hdr; void *err = ERR_PTR(-ENOMEM); if (table >= end) return NULL; hdr = table; if (!hdr->length) { dev_warn(dev, "found a zero length table '%d' parsing nfit\n", hdr->type); return NULL; } switch (hdr->type) { case ACPI_NFIT_TYPE_SYSTEM_ADDRESS: if (!add_spa(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_MEMORY_MAP: if (!add_memdev(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_CONTROL_REGION: if (!add_dcr(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_DATA_REGION: if (!add_bdw(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_INTERLEAVE: if (!add_idt(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_FLUSH_ADDRESS: if (!add_flush(acpi_desc, prev, table)) return err; break; case ACPI_NFIT_TYPE_SMBIOS: dev_dbg(dev, "smbios\n"); break; case ACPI_NFIT_TYPE_CAPABILITIES: if (!add_platform_cap(acpi_desc, table)) return err; break; default: dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type); break; } return table + hdr->length; } static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_system_address *spa) { struct nfit_mem *nfit_mem, *found; struct nfit_memdev *nfit_memdev; int type = spa ? nfit_spa_type(spa) : 0; switch (type) { case NFIT_SPA_DCR: case NFIT_SPA_PM: break; default: if (spa) return 0; } /* * This loop runs in two modes, when a dimm is mapped the loop * adds memdev associations to an existing dimm, or creates a * dimm. In the unmapped dimm case this loop sweeps for memdev * instances with an invalid / zero range_index and adds those * dimms without spa associations. 
*/ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct nfit_flush *nfit_flush; struct nfit_dcr *nfit_dcr; u32 device_handle; u16 dcr; if (spa && nfit_memdev->memdev->range_index != spa->range_index) continue; if (!spa && nfit_memdev->memdev->range_index) continue; found = NULL; dcr = nfit_memdev->memdev->region_index; device_handle = nfit_memdev->memdev->device_handle; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) { found = nfit_mem; break; } if (found) nfit_mem = found; else { nfit_mem = devm_kzalloc(acpi_desc->dev, sizeof(*nfit_mem), GFP_KERNEL); if (!nfit_mem) return -ENOMEM; INIT_LIST_HEAD(&nfit_mem->list); nfit_mem->acpi_desc = acpi_desc; list_add(&nfit_mem->list, &acpi_desc->dimms); } list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != dcr) continue; /* * Record the control region for the dimm. For * the ACPI 6.1 case, where there are separate * control regions for the pmem vs blk * interfaces, be sure to record the extended * blk details. */ if (!nfit_mem->dcr) nfit_mem->dcr = nfit_dcr->dcr; else if (nfit_mem->dcr->windows == 0 && nfit_dcr->dcr->windows) nfit_mem->dcr = nfit_dcr->dcr; break; } list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) { struct acpi_nfit_flush_address *flush; u16 i; if (nfit_flush->flush->device_handle != device_handle) continue; nfit_mem->nfit_flush = nfit_flush; flush = nfit_flush->flush; nfit_mem->flush_wpq = devm_kcalloc(acpi_desc->dev, flush->hint_count, sizeof(struct resource), GFP_KERNEL); if (!nfit_mem->flush_wpq) return -ENOMEM; for (i = 0; i < flush->hint_count; i++) { struct resource *res = &nfit_mem->flush_wpq[i]; res->start = flush->hint_address[i]; res->end = res->start + 8 - 1; } break; } if (dcr && !nfit_mem->dcr) { dev_err(acpi_desc->dev, "SPA %d missing DCR %d\n", spa->range_index, dcr); return -ENODEV; } if (type == NFIT_SPA_DCR) { struct nfit_idt *nfit_idt; u16 idt_idx; /* multiple dimms may share a SPA when interleaved */ nfit_mem->spa_dcr = spa; nfit_mem->memdev_dcr = nfit_memdev->memdev; idt_idx = nfit_memdev->memdev->interleave_index; list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { if (nfit_idt->idt->interleave_index != idt_idx) continue; nfit_mem->idt_dcr = nfit_idt->idt; break; } } else if (type == NFIT_SPA_PM) { /* * A single dimm may belong to multiple SPA-PM * ranges, record at least one in addition to * any SPA-DCR range. */ nfit_mem->memdev_pmem = nfit_memdev->memdev; } else nfit_mem->memdev_dcr = nfit_memdev->memdev; } return 0; } static int nfit_mem_cmp(void *priv, const struct list_head *_a, const struct list_head *_b) { struct nfit_mem *a = container_of(_a, typeof(*a), list); struct nfit_mem *b = container_of(_b, typeof(*b), list); u32 handleA, handleB; handleA = __to_nfit_memdev(a)->device_handle; handleB = __to_nfit_memdev(b)->device_handle; if (handleA < handleB) return -1; else if (handleA > handleB) return 1; return 0; } static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; int rc; /* * For each SPA-DCR or SPA-PMEM address range find its * corresponding MEMDEV(s). From each MEMDEV find the * corresponding DCR. Then, if we're operating on a SPA-DCR, * try to find a SPA-BDW and a corresponding BDW that references * the DCR. Throw it all into an nfit_mem object. Note, that * BDWs are optional. 
*/ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { rc = __nfit_mem_init(acpi_desc, nfit_spa->spa); if (rc) return rc; } /* * If a DIMM has failed to be mapped into SPA there will be no * SPA entries above. Find and register all the unmapped DIMMs * for reporting and recovery purposes. */ rc = __nfit_mem_init(acpi_desc, NULL); if (rc) return rc; list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); return 0; } static ssize_t bus_dsm_mask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%#lx\n", acpi_desc->bus_dsm_mask); } static struct device_attribute dev_attr_bus_dsm_mask = __ATTR(dsm_mask, 0444, bus_dsm_mask_show, NULL); static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%d\n", acpi_desc->acpi_header.revision); } static DEVICE_ATTR_RO(revision); static ssize_t hw_error_scrub_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); return sysfs_emit(buf, "%d\n", acpi_desc->scrub_mode); } /* * The 'hw_error_scrub' attribute can have the following values written to it: * '0': Switch to the default mode where an exception will only insert * the address of the memory error into the poison and badblocks lists. * '1': Enable a full scrub to happen if an exception for a memory error is * received. */ static ssize_t hw_error_scrub_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct nvdimm_bus_descriptor *nd_desc; ssize_t rc; long val; rc = kstrtol(buf, 0, &val); if (rc) return rc; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); switch (val) { case HW_ERROR_SCRUB_ON: acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON; break; case HW_ERROR_SCRUB_OFF: acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF; break; default: rc = -EINVAL; break; } } device_unlock(dev); if (rc) return rc; return size; } static DEVICE_ATTR_RW(hw_error_scrub); /* * This shows the number of full Address Range Scrubs that have been * completed since driver load time. Userspace can wait on this using * select/poll etc. A '+' at the end indicates an ARS is in progress */ static ssize_t scrub_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm_bus_descriptor *nd_desc; struct acpi_nfit_desc *acpi_desc; ssize_t rc = -ENXIO; bool busy; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (!nd_desc) { device_unlock(dev); return rc; } acpi_desc = to_acpi_desc(nd_desc); mutex_lock(&acpi_desc->init_mutex); busy = test_bit(ARS_BUSY, &acpi_desc->scrub_flags) && !test_bit(ARS_CANCEL, &acpi_desc->scrub_flags); rc = sysfs_emit(buf, "%d%s", acpi_desc->scrub_count, busy ? 
"+\n" : "\n"); /* Allow an admin to poll the busy state at a higher rate */ if (busy && capable(CAP_SYS_RAWIO) && !test_and_set_bit(ARS_POLL, &acpi_desc->scrub_flags)) { acpi_desc->scrub_tmo = 1; mod_delayed_work(nfit_wq, &acpi_desc->dwork, HZ); } mutex_unlock(&acpi_desc->init_mutex); device_unlock(dev); return rc; } static ssize_t scrub_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct nvdimm_bus_descriptor *nd_desc; ssize_t rc; long val; rc = kstrtol(buf, 0, &val); if (rc) return rc; if (val != 1) return -EINVAL; device_lock(dev); nd_desc = dev_get_drvdata(dev); if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); rc = acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG); } device_unlock(dev); if (rc) return rc; return size; } static DEVICE_ATTR_RW(scrub); static bool ars_supported(struct nvdimm_bus *nvdimm_bus) { struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); const unsigned long mask = 1 << ND_CMD_ARS_CAP | 1 << ND_CMD_ARS_START | 1 << ND_CMD_ARS_STATUS; return (nd_desc->cmd_mask & mask) == mask; } static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); if (a == &dev_attr_scrub.attr) return ars_supported(nvdimm_bus) ? a->mode : 0; if (a == &dev_attr_firmware_activate_noidle.attr) return intel_fwa_supported(nvdimm_bus) ? a->mode : 0; return a->mode; } static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, &dev_attr_hw_error_scrub.attr, &dev_attr_bus_dsm_mask.attr, &dev_attr_firmware_activate_noidle.attr, NULL, }; static const struct attribute_group acpi_nfit_attribute_group = { .name = "nfit", .attrs = acpi_nfit_attributes, .is_visible = nfit_visible, }; static const struct attribute_group *acpi_nfit_attribute_groups[] = { &acpi_nfit_attribute_group, NULL, }; static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return __to_nfit_memdev(nfit_mem); } static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return nfit_mem->dcr; } static ssize_t handle_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); return sysfs_emit(buf, "%#x\n", memdev->device_handle); } static DEVICE_ATTR_RO(handle); static ssize_t phys_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); return sysfs_emit(buf, "%#x\n", memdev->physical_id); } static DEVICE_ATTR_RO(phys_id); static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->vendor_id)); } static DEVICE_ATTR_RO(vendor); static ssize_t rev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->revision_id)); } static DEVICE_ATTR_RO(rev_id); static ssize_t device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->device_id)); } static DEVICE_ATTR_RO(device); static 
ssize_t subsystem_vendor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_vendor_id)); } static DEVICE_ATTR_RO(subsystem_vendor); static ssize_t subsystem_rev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_revision_id)); } static DEVICE_ATTR_RO(subsystem_rev_id); static ssize_t subsystem_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_device_id)); } static DEVICE_ATTR_RO(subsystem_device); static int num_nvdimm_formats(struct nvdimm *nvdimm) { struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); int formats = 0; if (nfit_mem->memdev_pmem) formats++; return formats; } static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%04x\n", le16_to_cpu(dcr->code)); } static DEVICE_ATTR_RO(format); static ssize_t format1_show(struct device *dev, struct device_attribute *attr, char *buf) { u32 handle; ssize_t rc = -ENXIO; struct nfit_mem *nfit_mem; struct nfit_memdev *nfit_memdev; struct acpi_nfit_desc *acpi_desc; struct nvdimm *nvdimm = to_nvdimm(dev); struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); nfit_mem = nvdimm_provider_data(nvdimm); acpi_desc = nfit_mem->acpi_desc; handle = to_nfit_memdev(dev)->device_handle; /* assumes DIMMs have at most 2 published interface codes */ mutex_lock(&acpi_desc->init_mutex); list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nfit_dcr *nfit_dcr; if (memdev->device_handle != handle) continue; list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != memdev->region_index) continue; if (nfit_dcr->dcr->code == dcr->code) continue; rc = sysfs_emit(buf, "0x%04x\n", le16_to_cpu(nfit_dcr->dcr->code)); break; } if (rc != -ENXIO) break; } mutex_unlock(&acpi_desc->init_mutex); return rc; } static DEVICE_ATTR_RO(format1); static ssize_t formats_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); return sysfs_emit(buf, "%d\n", num_nvdimm_formats(nvdimm)); } static DEVICE_ATTR_RO(formats); static ssize_t serial_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); return sysfs_emit(buf, "0x%08x\n", be32_to_cpu(dcr->serial_number)); } static DEVICE_ATTR_RO(serial); static ssize_t family_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem->family < 0) return -ENXIO; return sysfs_emit(buf, "%d\n", nfit_mem->family); } static DEVICE_ATTR_RO(family); static ssize_t dsm_mask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (nfit_mem->family < 0) return -ENXIO; return sysfs_emit(buf, "%#lx\n", nfit_mem->dsm_mask); } static DEVICE_ATTR_RO(dsm_mask); static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm 
*nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); u16 flags = __to_nfit_memdev(nfit_mem)->flags; if (test_bit(NFIT_MEM_DIRTY, &nfit_mem->flags)) flags |= ACPI_NFIT_MEM_FLUSH_FAILED; return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "", flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "", flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "", flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "", flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "", flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "", flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? "smart_notify " : ""); } static DEVICE_ATTR_RO(flags); static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return sysfs_emit(buf, "%s\n", nfit_mem->id); } static DEVICE_ATTR_RO(id); static ssize_t dirty_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); return sysfs_emit(buf, "%d\n", nfit_mem->dirty_shutdown); } static DEVICE_ATTR_RO(dirty_shutdown); static struct attribute *acpi_nfit_dimm_attributes[] = { &dev_attr_handle.attr, &dev_attr_phys_id.attr, &dev_attr_vendor.attr, &dev_attr_device.attr, &dev_attr_rev_id.attr, &dev_attr_subsystem_vendor.attr, &dev_attr_subsystem_device.attr, &dev_attr_subsystem_rev_id.attr, &dev_attr_format.attr, &dev_attr_formats.attr, &dev_attr_format1.attr, &dev_attr_serial.attr, &dev_attr_flags.attr, &dev_attr_id.attr, &dev_attr_family.attr, &dev_attr_dsm_mask.attr, &dev_attr_dirty_shutdown.attr, NULL, }; static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); if (!to_nfit_dcr(dev)) { /* Without a dcr only the memdev attributes can be surfaced */ if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr || a == &dev_attr_flags.attr || a == &dev_attr_family.attr || a == &dev_attr_dsm_mask.attr) return a->mode; return 0; } if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1) return 0; if (!test_bit(NFIT_MEM_DIRTY_COUNT, &nfit_mem->flags) && a == &dev_attr_dirty_shutdown.attr) return 0; return a->mode; } static const struct attribute_group acpi_nfit_dimm_attribute_group = { .name = "nfit", .attrs = acpi_nfit_dimm_attributes, .is_visible = acpi_nfit_dimm_attr_visible, }; static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { &acpi_nfit_dimm_attribute_group, NULL, }; static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, u32 device_handle) { struct nfit_mem *nfit_mem; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) return nfit_mem->nvdimm; return NULL; } void __acpi_nvdimm_notify(struct device *dev, u32 event) { struct nfit_mem *nfit_mem; struct acpi_nfit_desc *acpi_desc; dev_dbg(dev->parent, "%s: event: %d\n", dev_name(dev), event); if (event != NFIT_NOTIFY_DIMM_HEALTH) { dev_dbg(dev->parent, "%s: unknown event: %d\n", dev_name(dev), event); return; } acpi_desc = dev_get_drvdata(dev->parent); if (!acpi_desc) return; /* * If we successfully retrieved acpi_desc, then we know nfit_mem data * is still valid. 
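 * (The dimm drvdata set up in acpi_nfit_add_dimm() is cleared by
 * shutdown_dimm_notify() before acpi_desc itself is released.)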
*/ nfit_mem = dev_get_drvdata(dev); if (nfit_mem && nfit_mem->flags_attr) sysfs_notify_dirent(nfit_mem->flags_attr); } EXPORT_SYMBOL_GPL(__acpi_nvdimm_notify); static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) { struct acpi_device *adev = data; struct device *dev = &adev->dev; device_lock(dev->parent); __acpi_nvdimm_notify(dev, event); device_unlock(dev->parent); } static bool acpi_nvdimm_has_method(struct acpi_device *adev, char *method) { acpi_handle handle; acpi_status status; status = acpi_get_handle(adev->handle, method, &handle); if (ACPI_SUCCESS(status)) return true; return false; } __weak void nfit_intel_shutdown_status(struct nfit_mem *nfit_mem) { struct device *dev = &nfit_mem->adev->dev; struct nd_intel_smart smart = { 0 }; union acpi_object in_buf = { .buffer.type = ACPI_TYPE_BUFFER, .buffer.length = 0, }; union acpi_object in_obj = { .package.type = ACPI_TYPE_PACKAGE, .package.count = 1, .package.elements = &in_buf, }; const u8 func = ND_INTEL_SMART; const guid_t *guid = to_nfit_uuid(nfit_mem->family); u8 revid = nfit_dsm_revid(nfit_mem->family, func); struct acpi_device *adev = nfit_mem->adev; acpi_handle handle = adev->handle; union acpi_object *out_obj; if ((nfit_mem->dsm_mask & (1 << func)) == 0) return; out_obj = acpi_evaluate_dsm_typed(handle, guid, revid, func, &in_obj, ACPI_TYPE_BUFFER); if (!out_obj || out_obj->buffer.length < sizeof(smart)) { dev_dbg(dev->parent, "%s: failed to retrieve initial health\n", dev_name(dev)); ACPI_FREE(out_obj); return; } memcpy(&smart, out_obj->buffer.pointer, sizeof(smart)); ACPI_FREE(out_obj); if (smart.flags & ND_INTEL_SMART_SHUTDOWN_VALID) { if (smart.shutdown_state) set_bit(NFIT_MEM_DIRTY, &nfit_mem->flags); } if (smart.flags & ND_INTEL_SMART_SHUTDOWN_COUNT_VALID) { set_bit(NFIT_MEM_DIRTY_COUNT, &nfit_mem->flags); nfit_mem->dirty_shutdown = smart.shutdown_count; } } static void populate_shutdown_status(struct nfit_mem *nfit_mem) { /* * For DIMMs that provide a dynamic facility to retrieve a * dirty-shutdown status and/or a dirty-shutdown count, cache * these values in nfit_mem. */ if (nfit_mem->family == NVDIMM_FAMILY_INTEL) nfit_intel_shutdown_status(nfit_mem); } static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; unsigned long dsm_mask, label_mask; const guid_t *guid; int i; int family = -1; struct acpi_nfit_control_region *dcr = nfit_mem->dcr; /* nfit test assumes 1:1 relationship between commands and dsms */ nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; nfit_mem->family = NVDIMM_FAMILY_INTEL; set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID) sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x", be16_to_cpu(dcr->vendor_id), dcr->manufacturing_location, be16_to_cpu(dcr->manufacturing_date), be32_to_cpu(dcr->serial_number)); else sprintf(nfit_mem->id, "%04x-%08x", be16_to_cpu(dcr->vendor_id), be32_to_cpu(dcr->serial_number)); adev = to_acpi_dev(acpi_desc); if (!adev) { /* unit test case */ populate_shutdown_status(nfit_mem); return 0; } adev_dimm = acpi_find_child_device(adev, device_handle, false); nfit_mem->adev = adev_dimm; if (!adev_dimm) { dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n", device_handle); return force_enable_dimms ? 
0 : -ENODEV; } if (ACPI_FAILURE(acpi_install_notify_handler(adev_dimm->handle, ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify, adev_dimm))) { dev_err(dev, "%s: notification registration failed\n", dev_name(&adev_dimm->dev)); return -ENXIO; } /* * Record nfit_mem for the notification path to track back to * the nfit sysfs attributes for this dimm device object. */ dev_set_drvdata(&adev_dimm->dev, nfit_mem); /* * There are 4 "legacy" NVDIMM command sets * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before * an EFI working group was established to constrain this * proliferation. The nfit driver probes for the supported command * set by GUID. Note, if you're a platform developer looking to add * a new command set to this probe, consider using an existing set, * or otherwise seek approval to publish the command set at * http://www.uefi.org/RFIC_LIST. * * Note, that checking for function0 (bit0) tells us if any commands * are reachable through this GUID. */ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) { set_bit(i, &nd_desc->dimm_family_mask); if (family < 0 || i == default_dsm_family) family = i; } /* limit the supported commands to those that are publicly documented */ nfit_mem->family = family; if (override_dsm_mask && !disable_vendor_specific) dsm_mask = override_dsm_mask; else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { dsm_mask = NVDIMM_INTEL_CMDMASK; if (disable_vendor_specific) dsm_mask &= ~(1 << ND_CMD_VENDOR); } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) { dsm_mask = 0x1c3c76; } else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) { dsm_mask = 0x1fe; if (disable_vendor_specific) dsm_mask &= ~(1 << 8); } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) { dsm_mask = 0xffffffff; } else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) { dsm_mask = 0x1f; } else { dev_dbg(dev, "unknown dimm command family\n"); nfit_mem->family = -1; /* DSMs are optional, continue loading the driver... */ return 0; } /* * Function 0 is the command interrogation function, don't * export it to potential userspace use, and enable it to be * used as an error value in acpi_nfit_ctl(). */ dsm_mask &= ~1UL; guid = to_nfit_uuid(nfit_mem->family); for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev_dimm->handle, guid, nfit_dsm_revid(nfit_mem->family, i), 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); /* * Prefer the NVDIMM_FAMILY_INTEL label read commands if present * due to their better semantics handling locked capacity. */ label_mask = 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA | 1 << ND_CMD_SET_CONFIG_DATA; if (family == NVDIMM_FAMILY_INTEL && (dsm_mask & label_mask) == label_mask) /* skip _LS{I,R,W} enabling */; else { if (acpi_nvdimm_has_method(adev_dimm, "_LSI") && acpi_nvdimm_has_method(adev_dimm, "_LSR")) { dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev)); set_bit(NFIT_MEM_LSR, &nfit_mem->flags); } if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags) && acpi_nvdimm_has_method(adev_dimm, "_LSW")) { dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev)); set_bit(NFIT_MEM_LSW, &nfit_mem->flags); } /* * Quirk read-only label configurations to preserve * access to label-less namespaces by default. 
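 * That is, a DIMM exposing _LSI/_LSR but no _LSW is treated as
 * label-less below, unless the force_labels knob (defined earlier
 * in this driver) overrides that default.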
*/ if (!test_bit(NFIT_MEM_LSW, &nfit_mem->flags) && !force_labels) { dev_dbg(dev, "%s: No _LSW, disable labels\n", dev_name(&adev_dimm->dev)); clear_bit(NFIT_MEM_LSR, &nfit_mem->flags); } else dev_dbg(dev, "%s: Force enable labels\n", dev_name(&adev_dimm->dev)); } populate_shutdown_status(nfit_mem); return 0; } static void shutdown_dimm_notify(void *data) { struct acpi_nfit_desc *acpi_desc = data; struct nfit_mem *nfit_mem; mutex_lock(&acpi_desc->init_mutex); /* * Clear out the nfit_mem->flags_attr and shut down dimm event * notifications. */ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_device *adev_dimm = nfit_mem->adev; if (nfit_mem->flags_attr) { sysfs_put(nfit_mem->flags_attr); nfit_mem->flags_attr = NULL; } if (adev_dimm) { acpi_remove_notify_handler(adev_dimm->handle, ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); dev_set_drvdata(&adev_dimm->dev, NULL); } } mutex_unlock(&acpi_desc->init_mutex); } static const struct nvdimm_security_ops *acpi_nfit_get_security_ops(int family) { switch (family) { case NVDIMM_FAMILY_INTEL: return intel_security_ops; default: return NULL; } } static const struct nvdimm_fw_ops *acpi_nfit_get_fw_ops( struct nfit_mem *nfit_mem) { unsigned long mask; struct acpi_nfit_desc *acpi_desc = nfit_mem->acpi_desc; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; if (!nd_desc->fw_ops) return NULL; if (nfit_mem->family != NVDIMM_FAMILY_INTEL) return NULL; mask = nfit_mem->dsm_mask & NVDIMM_INTEL_FW_ACTIVATE_CMDMASK; if (mask != NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) return NULL; return intel_fw_ops; } static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; int dimm_count = 0, rc; struct nvdimm *nvdimm; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; struct nfit_memdev *nfit_memdev; u32 device_handle; u16 mem_flags; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); if (nvdimm) { dimm_count++; continue; } /* collate flags across all memdevs for this dimm */ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *dimm_memdev; dimm_memdev = __to_nfit_memdev(nfit_mem); if (dimm_memdev->device_handle != nfit_memdev->memdev->device_handle) continue; dimm_memdev->flags |= nfit_memdev->memdev->flags; } mem_flags = __to_nfit_memdev(nfit_mem)->flags; if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED) set_bit(NDD_UNARMED, &flags); rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); if (rc) continue; /* * TODO: provide translation for non-NVDIMM_FAMILY_INTEL * devices (i.e. from nd_cmd to acpi_dsm) to standardize the * userspace interface. */ cmd_mask = 1UL << ND_CMD_CALL; if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { /* * These commands have a 1:1 correspondence * between DSM payload and libnvdimm ioctl * payload format. */ cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK; } if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) { set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); } if (test_bit(NFIT_MEM_LSW, &nfit_mem->flags)) set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush : NULL; nvdimm = __nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, acpi_nfit_dimm_attribute_groups, flags, cmd_mask, flush ? 
flush->hint_count : 0, nfit_mem->flush_wpq, &nfit_mem->id[0], acpi_nfit_get_security_ops(nfit_mem->family), acpi_nfit_get_fw_ops(nfit_mem)); if (!nvdimm) return -ENOMEM; nfit_mem->nvdimm = nvdimm; dimm_count++; if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) continue; dev_err(acpi_desc->dev, "Error found in NVDIMM %s flags:%s%s%s%s%s\n", nvdimm_name(nvdimm), mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "", mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"", mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "", mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "", mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : ""); } rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); if (rc) return rc; /* * Now that dimms are successfully registered, and async registration * is flushed, attempt to enable event notification. */ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct kernfs_node *nfit_kernfs; nvdimm = nfit_mem->nvdimm; if (!nvdimm) continue; nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit"); if (nfit_kernfs) nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs, "flags"); sysfs_put(nfit_kernfs); if (!nfit_mem->flags_attr) dev_warn(acpi_desc->dev, "%s: notifications disabled\n", nvdimm_name(nvdimm)); } return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify, acpi_desc); } /* * These constants are private because there are no kernel consumers of * these commands. */ enum nfit_aux_cmds { NFIT_CMD_TRANSLATE_SPA = 5, NFIT_CMD_ARS_INJECT_SET = 7, NFIT_CMD_ARS_INJECT_CLEAR = 8, NFIT_CMD_ARS_INJECT_GET = 9, }; static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS); unsigned long dsm_mask, *mask; struct acpi_device *adev; int i; set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); /* enable nfit_test to inject bus command emulation */ if (acpi_desc->bus_cmd_force_en) { nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; mask = &nd_desc->bus_family_mask; if (acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]) { set_bit(NVDIMM_BUS_FAMILY_INTEL, mask); nd_desc->fw_ops = intel_bus_fw_ops; } } adev = to_acpi_dev(acpi_desc); if (!adev) return; for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &nd_desc->cmd_mask); dsm_mask = (1 << ND_CMD_ARS_CAP) | (1 << ND_CMD_ARS_START) | (1 << ND_CMD_ARS_STATUS) | (1 << ND_CMD_CLEAR_ERROR) | (1 << NFIT_CMD_TRANSLATE_SPA) | (1 << NFIT_CMD_ARS_INJECT_SET) | (1 << NFIT_CMD_ARS_INJECT_CLEAR) | (1 << NFIT_CMD_ARS_INJECT_GET); for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &acpi_desc->bus_dsm_mask); /* Enumerate allowed NVDIMM_BUS_FAMILY_INTEL commands */ dsm_mask = NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK; guid = to_nfit_bus_uuid(NVDIMM_BUS_FAMILY_INTEL); mask = &acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]; for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, mask); if (*mask == dsm_mask) { set_bit(NVDIMM_BUS_FAMILY_INTEL, &nd_desc->bus_family_mask); nd_desc->fw_ops = intel_bus_fw_ops; } } static ssize_t range_index_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nd_region *nd_region = to_nd_region(dev); struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); return sysfs_emit(buf, "%d\n", 
nfit_spa->spa->range_index); } static DEVICE_ATTR_RO(range_index); static struct attribute *acpi_nfit_region_attributes[] = { &dev_attr_range_index.attr, NULL, }; static const struct attribute_group acpi_nfit_region_attribute_group = { .name = "nfit", .attrs = acpi_nfit_region_attributes, }; static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { &acpi_nfit_region_attribute_group, NULL, }; /* enough info to uniquely specify an interleave set */ struct nfit_set_info { u64 region_offset; u32 serial_number; u32 pad; }; struct nfit_set_info2 { u64 region_offset; u32 serial_number; u16 vendor_id; u16 manufacturing_date; u8 manufacturing_location; u8 reserved[31]; }; static int cmp_map_compat(const void *m0, const void *m1) { const struct nfit_set_info *map0 = m0; const struct nfit_set_info *map1 = m1; return memcmp(&map0->region_offset, &map1->region_offset, sizeof(u64)); } static int cmp_map(const void *m0, const void *m1) { const struct nfit_set_info *map0 = m0; const struct nfit_set_info *map1 = m1; if (map0->region_offset < map1->region_offset) return -1; else if (map0->region_offset > map1->region_offset) return 1; return 0; } static int cmp_map2(const void *m0, const void *m1) { const struct nfit_set_info2 *map0 = m0; const struct nfit_set_info2 *map1 = m1; if (map0->region_offset < map1->region_offset) return -1; else if (map0->region_offset > map1->region_offset) return 1; return 0; } /* Retrieve the nth entry referencing this spa */ static struct acpi_nfit_memory_map *memdev_from_spa( struct acpi_nfit_desc *acpi_desc, u16 range_index, int n) { struct nfit_memdev *nfit_memdev; list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) if (nfit_memdev->memdev->range_index == range_index) if (n-- == 0) return nfit_memdev->memdev; return NULL; } static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc *ndr_desc, struct acpi_nfit_system_address *spa) { u16 nr = ndr_desc->num_mappings; struct nfit_set_info2 *info2 __free(kfree) = kcalloc(nr, sizeof(*info2), GFP_KERNEL); struct nfit_set_info *info __free(kfree) = kcalloc(nr, sizeof(*info), GFP_KERNEL); struct device *dev = acpi_desc->dev; struct nd_interleave_set *nd_set; int i; if (!info || !info2) return -ENOMEM; nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); if (!nd_set) return -ENOMEM; import_guid(&nd_set->type_guid, spa->range_guid); for (i = 0; i < nr; i++) { struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct nfit_set_info *map = &info[i]; struct nfit_set_info2 *map2 = &info2[i]; struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); struct acpi_nfit_control_region *dcr = nfit_mem->dcr; if (!memdev || !nfit_mem->dcr) { dev_err(dev, "%s: failed to find DCR\n", __func__); return -ENODEV; } map->region_offset = memdev->region_offset; map->serial_number = dcr->serial_number; map2->region_offset = memdev->region_offset; map2->serial_number = dcr->serial_number; map2->vendor_id = dcr->vendor_id; map2->manufacturing_date = dcr->manufacturing_date; map2->manufacturing_location = dcr->manufacturing_location; } /* v1.1 namespaces */ sort(info, nr, sizeof(*info), cmp_map, NULL); nd_set->cookie1 = nd_fletcher64(info, sizeof(*info) * nr, 0); /* v1.2 namespaces */ sort(info2, nr, sizeof(*info2), cmp_map2, NULL); nd_set->cookie2 = nd_fletcher64(info2, sizeof(*info2) * nr, 0); /* support v1.1 namespaces created with the wrong sort order */ 
sort(info, nr, sizeof(*info), cmp_map_compat, NULL); nd_set->altcookie = nd_fletcher64(info, sizeof(*info) * nr, 0); /* record the result of the sort for the mapping position */ for (i = 0; i < nr; i++) { struct nfit_set_info2 *map2 = &info2[i]; int j; for (j = 0; j < nr; j++) { struct nd_mapping_desc *mapping = &ndr_desc->mapping[j]; struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_control_region *dcr = nfit_mem->dcr; if (map2->serial_number == dcr->serial_number && map2->vendor_id == dcr->vendor_id && map2->manufacturing_date == dcr->manufacturing_date && map2->manufacturing_location == dcr->manufacturing_location) { mapping->position = i; break; } } } ndr_desc->nd_set = nd_set; return 0; } static int ars_get_cap(struct acpi_nfit_desc *acpi_desc, struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_nfit_system_address *spa = nfit_spa->spa; int cmd_rc, rc; cmd->address = spa->address; cmd->length = spa->length; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, sizeof(*cmd), &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa, enum nfit_ars_state req_type) { int rc; int cmd_rc; struct nd_cmd_ars_start ars_start; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; memset(&ars_start, 0, sizeof(ars_start)); ars_start.address = spa->address; ars_start.length = spa->length; if (req_type == ARS_REQ_SHORT) ars_start.flags = ND_ARS_RETURN_PREV_DATA; if (nfit_spa_type(spa) == NFIT_SPA_PM) ars_start.type = ND_ARS_PERSISTENT; else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) ars_start.type = ND_ARS_VOLATILE; else return -ENOTTY; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start, sizeof(ars_start), &cmd_rc); if (rc < 0) return rc; if (cmd_rc < 0) return cmd_rc; set_bit(ARS_VALID, &acpi_desc->scrub_flags); return 0; } static int ars_continue(struct acpi_nfit_desc *acpi_desc) { int rc, cmd_rc; struct nd_cmd_ars_start ars_start; struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; ars_start = (struct nd_cmd_ars_start) { .address = ars_status->restart_address, .length = ars_status->restart_length, .type = ars_status->type, }; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start, sizeof(ars_start), &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static int ars_get_status(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; int rc, cmd_rc; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, ars_status, acpi_desc->max_ars, &cmd_rc); if (rc < 0) return rc; return cmd_rc; } static void ars_complete(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_region *nd_region = nfit_spa->nd_region; struct device *dev; lockdep_assert_held(&acpi_desc->init_mutex); /* * Only advance the ARS state for ARS runs initiated by the * kernel, ignore ARS results from BIOS initiated runs for scrub * completion tracking. 
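 * acpi_desc->scrub_spa is only set for kernel-initiated runs (see
 * __acpi_nfit_scrub()), so a mismatch here means this status
 * payload belongs to a BIOS-initiated scrub.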
*/ if (acpi_desc->scrub_spa != nfit_spa) return; if ((ars_status->address >= spa->address && ars_status->address < spa->address + spa->length) || (ars_status->address < spa->address)) { /* * Assume that if a scrub starts at an offset from the * start of nfit_spa that we are in the continuation * case. * * Otherwise, if the scrub covers the spa range, mark * any pending request complete. */ if (ars_status->address + ars_status->length >= spa->address + spa->length) /* complete */; else return; } else return; acpi_desc->scrub_spa = NULL; if (nd_region) { dev = nd_region_dev(nd_region); nvdimm_region_notify(nd_region, NVDIMM_REVALIDATE_POISON); } else dev = acpi_desc->dev; dev_dbg(dev, "ARS: range %d complete\n", spa->range_index); } static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status; int rc; u32 i; /* * First record starts at 44 byte offset from the start of the * payload. */ if (ars_status->out_length < 44) return 0; /* * Ignore potentially stale results that are only refreshed * after a start-ARS event. */ if (!test_and_clear_bit(ARS_VALID, &acpi_desc->scrub_flags)) { dev_dbg(acpi_desc->dev, "skip %d stale records\n", ars_status->num_records); return 0; } for (i = 0; i < ars_status->num_records; i++) { /* only process full records */ if (ars_status->out_length < 44 + sizeof(struct nd_ars_record) * (i + 1)) break; rc = nvdimm_bus_add_badrange(nvdimm_bus, ars_status->records[i].err_address, ars_status->records[i].length); if (rc) return rc; } if (i < ars_status->num_records) dev_warn(acpi_desc->dev, "detected truncated ars results\n"); return 0; } static void acpi_nfit_remove_resource(void *data) { struct resource *res = data; remove_resource(res); } static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc *ndr_desc) { struct resource *res, *nd_res = ndr_desc->res; int is_pmem, ret; /* No operation if the region is already registered as PMEM */ is_pmem = region_intersects(nd_res->start, resource_size(nd_res), IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY); if (is_pmem == REGION_INTERSECTS) return 0; res = devm_kzalloc(acpi_desc->dev, sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; res->name = "Persistent Memory"; res->start = nd_res->start; res->end = nd_res->end; res->flags = IORESOURCE_MEM; res->desc = IORES_DESC_PERSISTENT_MEMORY; ret = insert_resource(&iomem_resource, res); if (ret) return ret; ret = devm_add_action_or_reset(acpi_desc->dev, acpi_nfit_remove_resource, res); if (ret) return ret; return 0; } static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping_desc *mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, struct nfit_spa *nfit_spa) { struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, memdev->device_handle); struct acpi_nfit_system_address *spa = nfit_spa->spa; if (!nvdimm) { dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n", spa->range_index, memdev->device_handle); return -ENODEV; } mapping->nvdimm = nvdimm; switch (nfit_spa_type(spa)) { case NFIT_SPA_PM: case NFIT_SPA_VOLATILE: mapping->start = memdev->address; mapping->size = memdev->region_size; break; } return 0; } static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) { return (nfit_spa_type(spa) == NFIT_SPA_VDISK || nfit_spa_type(spa) == NFIT_SPA_VCD || nfit_spa_type(spa) == NFIT_SPA_PDISK || nfit_spa_type(spa) == NFIT_SPA_PCD); } static bool 
nfit_spa_is_volatile(struct acpi_nfit_system_address *spa) { return (nfit_spa_type(spa) == NFIT_SPA_VDISK || nfit_spa_type(spa) == NFIT_SPA_VCD || nfit_spa_type(spa) == NFIT_SPA_VOLATILE); } static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_region_desc *ndr_desc, _ndr_desc; struct nfit_memdev *nfit_memdev; struct nvdimm_bus *nvdimm_bus; struct resource res; int count = 0, rc; if (nfit_spa->nd_region) return 0; if (spa->range_index == 0 && !nfit_spa_is_virtual(spa)) { dev_dbg(acpi_desc->dev, "detected invalid spa index\n"); return 0; } memset(&res, 0, sizeof(res)); memset(&mappings, 0, sizeof(mappings)); memset(&_ndr_desc, 0, sizeof(_ndr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; ndr_desc = &_ndr_desc; ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = pxm_to_online_node(spa->proximity_domain); ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; } /* Fallback to address based numa information if node lookup failed */ if (ndr_desc->numa_node == NUMA_NO_NODE) { ndr_desc->numa_node = memory_add_physaddr_to_nid(spa->address); dev_info(acpi_desc->dev, "changing numa node from %d to %d for nfit region [%pa-%pa]", NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end); } if (ndr_desc->target_node == NUMA_NO_NODE) { ndr_desc->target_node = phys_to_target_node(spa->address); dev_info(acpi_desc->dev, "changing target node from %d to %d for nfit region [%pa-%pa]", NUMA_NO_NODE, ndr_desc->target_node, &res.start, &res.end); } /* * Persistence domain bits are hierarchical, if * ACPI_NFIT_CAPABILITY_CACHE_FLUSH is set then * ACPI_NFIT_CAPABILITY_MEM_FLUSH is implied. 
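 * For example, a platform that advertises both capabilities gets
 * only the stronger ND_REGION_PERSIST_CACHE flag set below.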
*/ if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_CACHE_FLUSH) set_bit(ND_REGION_PERSIST_CACHE, &ndr_desc->flags); else if (acpi_desc->platform_cap & ACPI_NFIT_CAPABILITY_MEM_FLUSH) set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc->flags); list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping_desc *mapping; /* range index 0 == unmapped in SPA or invalid-SPA */ if (memdev->range_index == 0 || spa->range_index == 0) continue; if (memdev->range_index != spa->range_index) continue; if (count >= ND_MAX_MAPPINGS) { dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n", spa->range_index, ND_MAX_MAPPINGS); return -ENXIO; } mapping = &mappings[count++]; rc = acpi_nfit_init_mapping(acpi_desc, mapping, ndr_desc, memdev, nfit_spa); if (rc) goto out; } ndr_desc->mapping = mappings; ndr_desc->num_mappings = count; rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) goto out; nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { rc = acpi_nfit_insert_resource(acpi_desc, ndr_desc); if (rc) { dev_warn(acpi_desc->dev, "failed to insert pmem resource to iomem: %d\n", rc); goto out; } nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } else if (nfit_spa_is_volatile(spa)) { nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } else if (nfit_spa_is_virtual(spa)) { nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; } out: if (rc) dev_err(acpi_desc->dev, "failed to register spa range %d\n", nfit_spa->spa->range_index); return rc; } static int ars_status_alloc(struct acpi_nfit_desc *acpi_desc) { struct device *dev = acpi_desc->dev; struct nd_cmd_ars_status *ars_status; if (acpi_desc->ars_status) { memset(acpi_desc->ars_status, 0, acpi_desc->max_ars); return 0; } ars_status = devm_kzalloc(dev, acpi_desc->max_ars, GFP_KERNEL); if (!ars_status) return -ENOMEM; acpi_desc->ars_status = ars_status; return 0; } static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc) { int rc; if (ars_status_alloc(acpi_desc)) return -ENOMEM; rc = ars_get_status(acpi_desc); if (rc < 0 && rc != -ENOSPC) return rc; if (ars_status_process_records(acpi_desc)) dev_err(acpi_desc->dev, "Failed to process ARS records\n"); return rc; } static int ars_register(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { int rc; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) return acpi_nfit_register_region(acpi_desc, nfit_spa); set_bit(ARS_REQ_SHORT, &nfit_spa->ars_state); if (!no_init_ars) set_bit(ARS_REQ_LONG, &nfit_spa->ars_state); switch (acpi_nfit_query_poison(acpi_desc)) { case 0: case -ENOSPC: case -EAGAIN: rc = ars_start(acpi_desc, nfit_spa, ARS_REQ_SHORT); /* shouldn't happen, try again later */ if (rc == -EBUSY) break; if (rc) { set_bit(ARS_FAILED, &nfit_spa->ars_state); break; } clear_bit(ARS_REQ_SHORT, &nfit_spa->ars_state); rc = acpi_nfit_query_poison(acpi_desc); if (rc) break; acpi_desc->scrub_spa = nfit_spa; ars_complete(acpi_desc, nfit_spa); /* * If ars_complete() says we didn't complete the * short scrub, we'll try again with a long * request. */ acpi_desc->scrub_spa = NULL; break; case -EBUSY: case -ENOMEM: /* * BIOS was using ARS, wait for it to complete (or * resources to become available) and then perform our * own scrubs. 
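 * The region is still registered below with its ARS request bits
 * intact; the deferred scrub worker (acpi_nfit_scrub()) retries
 * once ARS resources free up.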
*/ break; default: set_bit(ARS_FAILED, &nfit_spa->ars_state); break; } return acpi_nfit_register_region(acpi_desc, nfit_spa); } static void ars_complete_all(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; ars_complete(acpi_desc, nfit_spa); } } static unsigned int __acpi_nfit_scrub(struct acpi_nfit_desc *acpi_desc, int query_rc) { unsigned int tmo = acpi_desc->scrub_tmo; struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; lockdep_assert_held(&acpi_desc->init_mutex); if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags)) return 0; if (query_rc == -EBUSY) { dev_dbg(dev, "ARS: ARS busy\n"); return min(30U * 60U, tmo * 2); } if (query_rc == -ENOSPC) { dev_dbg(dev, "ARS: ARS continue\n"); ars_continue(acpi_desc); return 1; } if (query_rc && query_rc != -EAGAIN) { unsigned long long addr, end; addr = acpi_desc->ars_status->address; end = addr + acpi_desc->ars_status->length; dev_dbg(dev, "ARS: %llx-%llx failed (%d)\n", addr, end, query_rc); } ars_complete_all(acpi_desc); list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { enum nfit_ars_state req_type; int rc; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; /* prefer short ARS requests first */ if (test_bit(ARS_REQ_SHORT, &nfit_spa->ars_state)) req_type = ARS_REQ_SHORT; else if (test_bit(ARS_REQ_LONG, &nfit_spa->ars_state)) req_type = ARS_REQ_LONG; else continue; rc = ars_start(acpi_desc, nfit_spa, req_type); dev = nd_region_dev(nfit_spa->nd_region); dev_dbg(dev, "ARS: range %d ARS start %s (%d)\n", nfit_spa->spa->range_index, req_type == ARS_REQ_SHORT ? "short" : "long", rc); /* * Hmm, we raced someone else starting ARS? Try again in * a bit. */ if (rc == -EBUSY) return 1; if (rc == 0) { dev_WARN_ONCE(dev, acpi_desc->scrub_spa, "scrub start while range %d active\n", acpi_desc->scrub_spa->spa->range_index); clear_bit(req_type, &nfit_spa->ars_state); acpi_desc->scrub_spa = nfit_spa; /* * Consider this spa last for future scrub * requests */ list_move_tail(&nfit_spa->list, &acpi_desc->spas); return 1; } dev_err(dev, "ARS: range %d ARS failed (%d)\n", nfit_spa->spa->range_index, rc); set_bit(ARS_FAILED, &nfit_spa->ars_state); } return 0; } static void __sched_ars(struct acpi_nfit_desc *acpi_desc, unsigned int tmo) { lockdep_assert_held(&acpi_desc->init_mutex); set_bit(ARS_BUSY, &acpi_desc->scrub_flags); /* note this should only be set from within the workqueue */ if (tmo) acpi_desc->scrub_tmo = tmo; queue_delayed_work(nfit_wq, &acpi_desc->dwork, tmo * HZ); } static void sched_ars(struct acpi_nfit_desc *acpi_desc) { __sched_ars(acpi_desc, 0); } static void notify_ars_done(struct acpi_nfit_desc *acpi_desc) { lockdep_assert_held(&acpi_desc->init_mutex); clear_bit(ARS_BUSY, &acpi_desc->scrub_flags); acpi_desc->scrub_count++; if (acpi_desc->scrub_count_state) sysfs_notify_dirent(acpi_desc->scrub_count_state); } static void acpi_nfit_scrub(struct work_struct *work) { struct acpi_nfit_desc *acpi_desc; unsigned int tmo; int query_rc; acpi_desc = container_of(work, typeof(*acpi_desc), dwork.work); mutex_lock(&acpi_desc->init_mutex); query_rc = acpi_nfit_query_poison(acpi_desc); tmo = __acpi_nfit_scrub(acpi_desc, query_rc); if (tmo) __sched_ars(acpi_desc, tmo); else notify_ars_done(acpi_desc); memset(acpi_desc->ars_status, 0, acpi_desc->max_ars); clear_bit(ARS_POLL, &acpi_desc->scrub_flags); mutex_unlock(&acpi_desc->init_mutex); } static void acpi_nfit_init_ars(struct acpi_nfit_desc *acpi_desc, struct nfit_spa 
*nfit_spa) { int type = nfit_spa_type(nfit_spa->spa); struct nd_cmd_ars_cap ars_cap; int rc; set_bit(ARS_FAILED, &nfit_spa->ars_state); memset(&ars_cap, 0, sizeof(ars_cap)); rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa); if (rc < 0) return; /* check that the supported scrub types match the spa type */ if (type == NFIT_SPA_VOLATILE && ((ars_cap.status >> 16) & ND_ARS_VOLATILE) == 0) return; if (type == NFIT_SPA_PM && ((ars_cap.status >> 16) & ND_ARS_PERSISTENT) == 0) return; nfit_spa->max_ars = ars_cap.max_ars_out; nfit_spa->clear_err_unit = ars_cap.clear_err_unit; acpi_desc->max_ars = max(nfit_spa->max_ars, acpi_desc->max_ars); clear_bit(ARS_FAILED, &nfit_spa->ars_state); } static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; int rc, do_sched_ars = 0; set_bit(ARS_VALID, &acpi_desc->scrub_flags); list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { switch (nfit_spa_type(nfit_spa->spa)) { case NFIT_SPA_VOLATILE: case NFIT_SPA_PM: acpi_nfit_init_ars(acpi_desc, nfit_spa); break; } } list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { switch (nfit_spa_type(nfit_spa->spa)) { case NFIT_SPA_VOLATILE: case NFIT_SPA_PM: /* register regions and kick off initial ARS run */ rc = ars_register(acpi_desc, nfit_spa); if (rc) return rc; /* * Kick off background ARS if at least one * region successfully registered ARS */ if (!test_bit(ARS_FAILED, &nfit_spa->ars_state)) do_sched_ars++; break; case NFIT_SPA_BDW: /* nothing to register */ break; case NFIT_SPA_DCR: case NFIT_SPA_VDISK: case NFIT_SPA_VCD: case NFIT_SPA_PDISK: case NFIT_SPA_PCD: /* register known regions that don't support ARS */ rc = acpi_nfit_register_region(acpi_desc, nfit_spa); if (rc) return rc; break; default: /* don't register unknown regions */ break; } } if (do_sched_ars) sched_ars(acpi_desc); return 0; } static int acpi_nfit_check_deletions(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev) { struct device *dev = acpi_desc->dev; if (!list_empty(&prev->spas) || !list_empty(&prev->memdevs) || !list_empty(&prev->dcrs) || !list_empty(&prev->bdws) || !list_empty(&prev->idts) || !list_empty(&prev->flushes)) { dev_err(dev, "new nfit deletes entries (unsupported)\n"); return -ENXIO; } return 0; } static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc) { struct device *dev = acpi_desc->dev; struct kernfs_node *nfit; struct device *bus_dev; if (!ars_supported(acpi_desc->nvdimm_bus)) return 0; bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); nfit = sysfs_get_dirent(bus_dev->kobj.sd, "nfit"); if (!nfit) { dev_err(dev, "sysfs_get_dirent 'nfit' failed\n"); return -ENODEV; } acpi_desc->scrub_count_state = sysfs_get_dirent(nfit, "scrub"); sysfs_put(nfit); if (!acpi_desc->scrub_count_state) { dev_err(dev, "sysfs_get_dirent 'scrub' failed\n"); return -ENODEV; } return 0; } static void acpi_nfit_unregister(void *data) { struct acpi_nfit_desc *acpi_desc = data; nvdimm_bus_unregister(acpi_desc->nvdimm_bus); } int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) { struct device *dev = acpi_desc->dev; struct nfit_table_prev prev; const void *end; int rc; if (!acpi_desc->nvdimm_bus) { acpi_nfit_init_dsms(acpi_desc); acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc); if (!acpi_desc->nvdimm_bus) return -ENOMEM; rc = devm_add_action_or_reset(dev, acpi_nfit_unregister, acpi_desc); if (rc) return rc; rc = acpi_nfit_desc_init_scrub_attr(acpi_desc); if (rc) return rc; /* register this acpi_desc for mce notifications */ 
mutex_lock(&acpi_desc_lock); list_add_tail(&acpi_desc->list, &acpi_descs); mutex_unlock(&acpi_desc_lock); } mutex_lock(&acpi_desc->init_mutex); INIT_LIST_HEAD(&prev.spas); INIT_LIST_HEAD(&prev.memdevs); INIT_LIST_HEAD(&prev.dcrs); INIT_LIST_HEAD(&prev.bdws); INIT_LIST_HEAD(&prev.idts); INIT_LIST_HEAD(&prev.flushes); list_cut_position(&prev.spas, &acpi_desc->spas, acpi_desc->spas.prev); list_cut_position(&prev.memdevs, &acpi_desc->memdevs, acpi_desc->memdevs.prev); list_cut_position(&prev.dcrs, &acpi_desc->dcrs, acpi_desc->dcrs.prev); list_cut_position(&prev.bdws, &acpi_desc->bdws, acpi_desc->bdws.prev); list_cut_position(&prev.idts, &acpi_desc->idts, acpi_desc->idts.prev); list_cut_position(&prev.flushes, &acpi_desc->flushes, acpi_desc->flushes.prev); end = data + sz; while (!IS_ERR_OR_NULL(data)) data = add_table(acpi_desc, &prev, data, end); if (IS_ERR(data)) { dev_dbg(dev, "nfit table parsing error: %ld\n", PTR_ERR(data)); rc = PTR_ERR(data); goto out_unlock; } rc = acpi_nfit_check_deletions(acpi_desc, &prev); if (rc) goto out_unlock; rc = nfit_mem_init(acpi_desc); if (rc) goto out_unlock; rc = acpi_nfit_register_dimms(acpi_desc); if (rc) goto out_unlock; rc = acpi_nfit_register_regions(acpi_desc); out_unlock: mutex_unlock(&acpi_desc->init_mutex); return rc; } EXPORT_SYMBOL_GPL(acpi_nfit_init); static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct device *dev = acpi_desc->dev; /* Bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */ device_lock(dev); device_unlock(dev); /* Bounce the init_mutex to complete initial registration */ mutex_lock(&acpi_desc->init_mutex); mutex_unlock(&acpi_desc->init_mutex); return 0; } static int __acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); if (nvdimm) return 0; if (cmd != ND_CMD_ARS_START) return 0; /* * The kernel and userspace may race to initiate a scrub, but * the scrub thread is prepared to lose that initial race. It * just needs guarantees that any ARS it initiates are not * interrupted by any intervening start requests from userspace. */ if (work_busy(&acpi_desc->dwork.work)) return -EBUSY; return 0; } /* * Prevent security and firmware activate commands from being issued via * ioctl. 
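 * Concretely, the check below rejects Intel DSM functions in
 * NVDIMM_INTEL_DENY_CMDMASK with -EOPNOTSUPP, and restricts
 * bus-level ND_CMD_CALL to the NVDIMM_BUS_FAMILY_NFIT family.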
*/ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf) { struct nd_cmd_pkg *call_pkg = buf; unsigned int func; if (nvdimm && cmd == ND_CMD_CALL && call_pkg->nd_family == NVDIMM_FAMILY_INTEL) { func = call_pkg->nd_command; if (func > NVDIMM_CMD_MAX || (1 << func) & NVDIMM_INTEL_DENY_CMDMASK) return -EOPNOTSUPP; } /* block all non-nfit bus commands */ if (!nvdimm && cmd == ND_CMD_CALL && call_pkg->nd_family != NVDIMM_BUS_FAMILY_NFIT) return -EOPNOTSUPP; return __acpi_nfit_clear_to_send(nd_desc, nvdimm, cmd); } int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, enum nfit_ars_state req_type) { struct device *dev = acpi_desc->dev; int scheduled = 0, busy = 0; struct nfit_spa *nfit_spa; mutex_lock(&acpi_desc->init_mutex); if (test_bit(ARS_CANCEL, &acpi_desc->scrub_flags)) { mutex_unlock(&acpi_desc->init_mutex); return 0; } list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { int type = nfit_spa_type(nfit_spa->spa); if (type != NFIT_SPA_PM && type != NFIT_SPA_VOLATILE) continue; if (test_bit(ARS_FAILED, &nfit_spa->ars_state)) continue; if (test_and_set_bit(req_type, &nfit_spa->ars_state)) busy++; else scheduled++; } if (scheduled) { sched_ars(acpi_desc); dev_dbg(dev, "ars_scan triggered\n"); } mutex_unlock(&acpi_desc->init_mutex); if (scheduled) return 0; if (busy) return -EBUSY; return -ENOTTY; } void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev) { struct nvdimm_bus_descriptor *nd_desc; dev_set_drvdata(dev, acpi_desc); acpi_desc->dev = dev; nd_desc = &acpi_desc->nd_desc; nd_desc->provider_name = "ACPI.NFIT"; nd_desc->module = THIS_MODULE; nd_desc->ndctl = acpi_nfit_ctl; nd_desc->flush_probe = acpi_nfit_flush_probe; nd_desc->clear_to_send = acpi_nfit_clear_to_send; nd_desc->attr_groups = acpi_nfit_attribute_groups; INIT_LIST_HEAD(&acpi_desc->spas); INIT_LIST_HEAD(&acpi_desc->dcrs); INIT_LIST_HEAD(&acpi_desc->bdws); INIT_LIST_HEAD(&acpi_desc->idts); INIT_LIST_HEAD(&acpi_desc->flushes); INIT_LIST_HEAD(&acpi_desc->memdevs); INIT_LIST_HEAD(&acpi_desc->dimms); INIT_LIST_HEAD(&acpi_desc->list); mutex_init(&acpi_desc->init_mutex); acpi_desc->scrub_tmo = 1; INIT_DELAYED_WORK(&acpi_desc->dwork, acpi_nfit_scrub); } EXPORT_SYMBOL_GPL(acpi_nfit_desc_init); static void acpi_nfit_put_table(void *table) { acpi_put_table(table); } static void acpi_nfit_notify(acpi_handle handle, u32 event, void *data) { struct acpi_device *adev = data; device_lock(&adev->dev); __acpi_nfit_notify(&adev->dev, handle, event); device_unlock(&adev->dev); } static void acpi_nfit_remove_notify_handler(void *data) { struct acpi_device *adev = data; acpi_dev_remove_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify); } void acpi_nfit_shutdown(void *data) { struct acpi_nfit_desc *acpi_desc = data; struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); /* * Destruct under acpi_desc_lock so that nfit_handle_mce does not * race teardown */ mutex_lock(&acpi_desc_lock); list_del(&acpi_desc->list); mutex_unlock(&acpi_desc_lock); mutex_lock(&acpi_desc->init_mutex); set_bit(ARS_CANCEL, &acpi_desc->scrub_flags); mutex_unlock(&acpi_desc->init_mutex); cancel_delayed_work_sync(&acpi_desc->dwork); /* * Bounce the nvdimm bus lock to make sure any in-flight * acpi_nfit_ars_rescan() submissions have had a chance to * either submit or see ->cancel set. 
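 * A rescan that wins that race is then drained by the
 * flush_workqueue(nfit_wq) call below.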
*/ device_lock(bus_dev); device_unlock(bus_dev); flush_workqueue(nfit_wq); } EXPORT_SYMBOL_GPL(acpi_nfit_shutdown); static int acpi_nfit_add(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_nfit_desc *acpi_desc; struct device *dev = &adev->dev; struct acpi_table_header *tbl; acpi_status status = AE_OK; acpi_size sz; int rc = 0; rc = acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify, adev); if (rc) return rc; rc = devm_add_action_or_reset(dev, acpi_nfit_remove_notify_handler, adev); if (rc) return rc; status = acpi_get_table(ACPI_SIG_NFIT, 0, &tbl); if (ACPI_FAILURE(status)) { /* The NVDIMM root device allows OS to trigger enumeration of * NVDIMMs through NFIT at boot time and re-enumeration at * root level via the _FIT method during runtime. * This is ok to return 0 here, we could have an nvdimm * hotplugged later and evaluate _FIT method which returns * data in the format of a series of NFIT Structures. */ dev_dbg(dev, "failed to find NFIT at startup\n"); return 0; } rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl); if (rc) return rc; sz = tbl->length; acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) return -ENOMEM; acpi_nfit_desc_init(acpi_desc, &adev->dev); /* Save the acpi header for exporting the revision via sysfs */ acpi_desc->acpi_header = *tbl; /* Evaluate _FIT and override with that if present */ status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf); if (ACPI_SUCCESS(status) && buf.length > 0) { union acpi_object *obj = buf.pointer; if (obj->type == ACPI_TYPE_BUFFER) rc = acpi_nfit_init(acpi_desc, obj->buffer.pointer, obj->buffer.length); else dev_dbg(dev, "invalid type %d, ignoring _FIT\n", (int) obj->type); kfree(buf.pointer); } else /* skip over the lead-in header table */ rc = acpi_nfit_init(acpi_desc, (void *) tbl + sizeof(struct acpi_table_nfit), sz - sizeof(struct acpi_table_nfit)); if (rc) return rc; return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc); } static void acpi_nfit_update_notify(struct device *dev, acpi_handle handle) { struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; acpi_status status; int ret; if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "no driver found for dev\n"); return; } if (!acpi_desc) { acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) return; acpi_nfit_desc_init(acpi_desc, dev); } else { /* * Finish previous registration before considering new * regions. 
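 * I.e. let any in-flight acpi_nfit_scrub() work item drain
 * before merging the updated _FIT payload.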
*/ flush_workqueue(nfit_wq); } /* Evaluate _FIT */ status = acpi_evaluate_object(handle, "_FIT", NULL, &buf); if (ACPI_FAILURE(status)) { dev_err(dev, "failed to evaluate _FIT\n"); return; } obj = buf.pointer; if (obj->type == ACPI_TYPE_BUFFER) { ret = acpi_nfit_init(acpi_desc, obj->buffer.pointer, obj->buffer.length); if (ret) dev_err(dev, "failed to merge updated NFIT\n"); } else dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); } static void acpi_nfit_uc_error_notify(struct device *dev, acpi_handle handle) { struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_LONG); else acpi_nfit_ars_rescan(acpi_desc, ARS_REQ_SHORT); } void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) { dev_dbg(dev, "event: 0x%x\n", event); switch (event) { case NFIT_NOTIFY_UPDATE: return acpi_nfit_update_notify(dev, handle); case NFIT_NOTIFY_UC_MEMORY_ERROR: return acpi_nfit_uc_error_notify(dev, handle); default: return; } } EXPORT_SYMBOL_GPL(__acpi_nfit_notify); static const struct acpi_device_id acpi_nfit_ids[] = { { "ACPI0012", 0 }, { "", 0 }, }; MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids); static struct acpi_driver acpi_nfit_driver = { .name = KBUILD_MODNAME, .ids = acpi_nfit_ids, .ops = { .add = acpi_nfit_add, }, }; static __init int nfit_init(void) { int ret; BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40); BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 64); BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48); BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 16); BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 8); BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80); BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40); BUILD_BUG_ON(sizeof(struct acpi_nfit_capabilities) != 16); guid_parse(UUID_VOLATILE_MEMORY, &nfit_uuid[NFIT_SPA_VOLATILE]); guid_parse(UUID_PERSISTENT_MEMORY, &nfit_uuid[NFIT_SPA_PM]); guid_parse(UUID_CONTROL_REGION, &nfit_uuid[NFIT_SPA_DCR]); guid_parse(UUID_DATA_REGION, &nfit_uuid[NFIT_SPA_BDW]); guid_parse(UUID_VOLATILE_VIRTUAL_DISK, &nfit_uuid[NFIT_SPA_VDISK]); guid_parse(UUID_VOLATILE_VIRTUAL_CD, &nfit_uuid[NFIT_SPA_VCD]); guid_parse(UUID_PERSISTENT_VIRTUAL_DISK, &nfit_uuid[NFIT_SPA_PDISK]); guid_parse(UUID_PERSISTENT_VIRTUAL_CD, &nfit_uuid[NFIT_SPA_PCD]); guid_parse(UUID_NFIT_BUS, &nfit_uuid[NFIT_DEV_BUS]); guid_parse(UUID_NFIT_DIMM, &nfit_uuid[NFIT_DEV_DIMM]); guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]); guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); guid_parse(UUID_INTEL_BUS, &nfit_uuid[NFIT_BUS_INTEL]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) return -ENOMEM; nfit_mce_register(); ret = acpi_bus_register_driver(&acpi_nfit_driver); if (ret) { nfit_mce_unregister(); destroy_workqueue(nfit_wq); } return ret; } static __exit void nfit_exit(void) { nfit_mce_unregister(); acpi_bus_unregister_driver(&acpi_nfit_driver); destroy_workqueue(nfit_wq); WARN_ON(!list_empty(&acpi_descs)); } module_init(nfit_init); module_exit(nfit_exit); MODULE_DESCRIPTION("ACPI NVDIMM Firmware Interface Table (NFIT) driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation");
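/*
 * Illustrative sketch, not part of the driver: the shape of the per-command
 * filter applied by acpi_nfit_clear_to_send() above for NVDIMM_FAMILY_INTEL
 * pass-through calls. A function number is refused when it exceeds the known
 * maximum or when its bit is set in a deny mask. EXAMPLE_CMD_MAX and
 * EXAMPLE_DENY_CMDMASK are hypothetical stand-ins for NVDIMM_CMD_MAX and
 * NVDIMM_INTEL_DENY_CMDMASK, whose real values are defined elsewhere.
 */
#define EXAMPLE_CMD_MAX		31				/* hypothetical */
#define EXAMPLE_DENY_CMDMASK	((1UL << 4) | (1UL << 5))	/* hypothetical */

static inline int example_clear_to_send(unsigned int func)
{
	if (func > EXAMPLE_CMD_MAX ||
	    ((1UL << func) & EXAMPLE_DENY_CMDMASK))
		return -EOPNOTSUPP;	/* refuse the pass-through command */
	return 0;			/* clear to send */
}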
// SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description describing their purpose: * "afs@example.com" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_preparse(struct key_preparsed_payload *); static void rxrpc_free_preparse(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); static long rxrpc_read(const struct key *, char *, size_t); /* * rxrpc defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload */ struct key_type key_type_rxrpc = { .name = "rxrpc", .flags = KEY_TYPE_NET_DOMAIN, .preparse = rxrpc_preparse, .free_preparse = rxrpc_free_preparse, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy, .describe = rxrpc_describe, .read = rxrpc_read, }; EXPORT_SYMBOL(key_type_rxrpc); /* * parse an RxKAD type XDR format token * - the caller guarantees we have at least 4 words */ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; u32 tktlen; _enter(",{%x,%x,%x,%x},%u", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (toklen <= 8 * 4) return -EKEYREJECTED; tktlen = ntohl(xdr[7]); _debug("tktlen: %x", tktlen); if (tktlen > AFSTOKEN_RK_TIX_MAX) return -EKEYREJECTED; if (toklen < 8 * 4 + tktlen) return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; prep->quotalen = datalen + plen; plen -= sizeof(*token); token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) return -ENOMEM; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) { kfree(token); return -ENOMEM; } token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = tktlen; token->kad->vice_id = ntohl(xdr[0]); token->kad->kvno = ntohl(xdr[1]); token->kad->start = ntohl(xdr[4]); token->kad->expiry = ntohl(xdr[5]); token->kad->primary_flag = ntohl(xdr[6]); memcpy(&token->kad->session_key, &xdr[2], 8); memcpy(&token->kad->ticket, &xdr[8], tktlen); _debug("SCIX: %u", token->security_index); _debug("TLEN: %u", token->kad->ticket_len); _debug("EXPY: %x", token->kad->expiry); _debug("KVNO: %u", token->kad->kvno); _debug("PRIM: %u", token->kad->primary_flag); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->session_key[0], token->kad->session_key[1], token->kad->session_key[2], token->kad->session_key[3], token->kad->session_key[4], token->kad->session_key[5], token->kad->session_key[6], token->kad->session_key[7]); if (token->kad->ticket_len >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->ticket[0], token->kad->ticket[1], token->kad->ticket[2], token->kad->ticket[3], token->kad->ticket[4], token->kad->ticket[5], token->kad->ticket[6], token->kad->ticket[7]); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken;
pptoken = &(*pptoken)->next) continue; *pptoken = token; expiry = rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; _leave(" = 0"); return 0; } static u64 xdr_dec64(const __be32 *xdr) { return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); } static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) { bool neg = false; u64 tmp = time_in_100ns; if (time_in_100ns < 0) { tmp = -time_in_100ns; neg = true; } do_div(tmp, 10000000); return neg ? -tmp : tmp; } /* * Parse a YFS-RxGK type XDR format token * - the caller guarantees we have at least 4 words * * struct token_rxgk { * opr_time begintime; * opr_time endtime; * afs_int64 level; * afs_int64 lifetime; * afs_int64 bytelife; * afs_int64 enctype; * opaque key<>; * opaque ticket<>; * }; */ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; const __be32 *ticket, *key; s64 tmp; u32 tktlen, keylen; _enter(",{%x,%x,%x,%x},%x", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (6 * 2 + 2 > toklen / 4) goto reject; key = xdr + (6 * 2 + 1); keylen = ntohl(key[-1]); _debug("keylen: %x", keylen); keylen = round_up(keylen, 4); if ((6 * 2 + 2) * 4 + keylen > toklen) goto reject; ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); tktlen = ntohl(ticket[-1]); _debug("tktlen: %x", tktlen); tktlen = round_up(tktlen, 4); if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", (6 * 2 + 2) * 4 + keylen + tktlen, toklen, keylen, tktlen); goto reject; } plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; prep->quotalen = datalen + plen; plen -= sizeof(*token); token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) goto nomem; token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); if (!token->rxgk) goto nomem_token; token->security_index = RXRPC_SECURITY_YFS_RXGK; token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) goto reject_token; token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); if (tmp < 0 || tmp > UINT_MAX) goto reject_token; token->rxgk->key.len = ntohl(key[-1]); token->rxgk->key.data = token->rxgk->_key; token->rxgk->ticket.len = ntohl(ticket[-1]); if (token->rxgk->endtime != 0) { expiry = rxrpc_s64_to_time64(token->rxgk->endtime); if (expiry < 0) goto expired; if (expiry < prep->expiry) prep->expiry = expiry; } memcpy(token->rxgk->key.data, key, token->rxgk->key.len); /* Pad the ticket so that we can use it directly in XDR */ token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), GFP_KERNEL); if (!token->rxgk->ticket.data) goto nomem_yrxgk; memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); _debug("SCIX: %u", token->security_index); _debug("EXPY: %llx", token->rxgk->endtime); _debug("LIFE: %llx", token->rxgk->lifetime); _debug("BYTE: %llx", token->rxgk->bytelife); _debug("ENC : %u", token->rxgk->enctype); _debug("LEVL: %u", token->rxgk->level); _debug("KLEN: %u", token->rxgk->key.len); _debug("TLEN: %u", token->rxgk->ticket.len); _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); _debug("TICK: %*phN", min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); /* count the number of tokens 
attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; *pptoken = token; _leave(" = 0"); return 0; nomem_yrxgk: kfree(token->rxgk); nomem_token: kfree(token); nomem: return -ENOMEM; reject_token: kfree(token); reject: return -EKEYREJECTED; expired: kfree(token->rxgk); kfree(token); return -EKEYEXPIRED; } /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words */ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) { const __be32 *xdr = prep->data, *token, *p; const char *cp; unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix; size_t datalen = prep->datalen; int ret, ret2; _enter(",{%x,%x,%x,%x},%zu", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), prep->datalen); if (datalen > AFSTOKEN_LENGTH_MAX) goto not_xdr; /* XDR is an array of __be32's */ if (datalen & 3) goto not_xdr; /* the flags should be 0 (the setpag bit must be handled by * userspace) */ if (ntohl(*xdr++) != 0) goto not_xdr; datalen -= 4; /* check the cell name */ len = ntohl(*xdr++); if (len < 1 || len > AFSTOKEN_CELL_MAX) goto not_xdr; datalen -= 4; paddedlen = (len + 3) & ~3; if (paddedlen > datalen) goto not_xdr; cp = (const char *) xdr; for (loop = 0; loop < len; loop++) if (!isprint(cp[loop])) goto not_xdr; for (; loop < paddedlen; loop++) if (cp[loop]) goto not_xdr; _debug("cellname: [%u/%u] '%*.*s'", len, paddedlen, len, len, (const char *) xdr); datalen -= paddedlen; xdr += paddedlen >> 2; /* get the token count */ if (datalen < 12) goto not_xdr; ntoken = ntohl(*xdr++); datalen -= 4; _debug("ntoken: %x", ntoken); if (ntoken < 1 || ntoken > AFSTOKEN_MAX) goto not_xdr; /* check each token wrapper */ p = xdr; loop = ntoken; do { if (datalen < 8) goto not_xdr; toklen = ntohl(*p++); sec_ix = ntohl(*p); datalen -= 4; _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); paddedlen = (toklen + 3) & ~3; if (toklen < 20 || toklen > datalen || paddedlen > datalen) goto not_xdr; datalen -= paddedlen; p += paddedlen >> 2; } while (--loop > 0); _debug("remainder: %zu", datalen); if (datalen != 0) goto not_xdr; /* okay: we're going to assume it's valid XDR format * - we ignore the cellname, relying on the key to be correctly named */ ret = -EPROTONOSUPPORT; do { toklen = ntohl(*xdr++); token = xdr; xdr += (toklen + 3) / 4; sec_ix = ntohl(*token++); toklen -= 4; _debug("TOKEN type=%x len=%x", sec_ix, toklen); switch (sec_ix) { case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; case RXRPC_SECURITY_YFS_RXGK: ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); break; default: ret2 = -EPROTONOSUPPORT; break; } switch (ret2) { case 0: ret = 0; break; case -EPROTONOSUPPORT: break; case -ENOPKG: if (ret != 0) ret = -ENOPKG; break; default: ret = ret2; goto error; } } while (--ntoken > 0); error: _leave(" = %d", ret); return ret; not_xdr: _leave(" = -EPROTO"); return -EPROTO; } /* * Preparse an rxrpc defined key. 
* * Data should be of the form: * OFFSET LEN CONTENT * 0 4 key interface version number * 4 2 security index (type) * 6 2 ticket length * 8 4 key expiry time (time_t) * 12 4 kvno * 16 8 session key * 24 [len] ticket * * if no data is provided, then a no-security key is made */ static int rxrpc_preparse(struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; time64_t expiry; size_t plen; u32 kver; int ret; _enter("%zu", prep->datalen); /* handle a no-security key */ if (!prep->data && prep->datalen == 0) return 0; /* determine if the XDR payload format is being used */ if (prep->datalen > 7 * 4) { ret = rxrpc_preparse_xdr(prep); if (ret != -EPROTO) return ret; } /* get the key interface version number */ ret = -EINVAL; if (prep->datalen <= 4 || !prep->data) goto error; memcpy(&kver, prep->data, sizeof(kver)); prep->data += sizeof(kver); prep->datalen -= sizeof(kver); _debug("KEY I/F VERSION: %u", kver); ret = -EKEYREJECTED; if (kver != 1) goto error; /* deal with a version 1 key */ ret = -EINVAL; if (prep->datalen < sizeof(*v1)) goto error; v1 = prep->data; if (prep->datalen != sizeof(*v1) + v1->ticket_length) goto error; _debug("SCIX: %u", v1->security_index); _debug("TLEN: %u", v1->ticket_length); _debug("EXPY: %x", v1->expiry); _debug("KVNO: %u", v1->kvno); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", v1->session_key[0], v1->session_key[1], v1->session_key[2], v1->session_key[3], v1->session_key[4], v1->session_key[5], v1->session_key[6], v1->session_key[7]); if (v1->ticket_length >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", v1->ticket[0], v1->ticket[1], v1->ticket[2], v1->ticket[3], v1->ticket[4], v1->ticket[5], v1->ticket[6], v1->ticket[7]); ret = -EPROTONOSUPPORT; if (v1->security_index != RXRPC_SECURITY_RXKAD) goto error; plen = sizeof(*token->kad) + v1->ticket_length; prep->quotalen = plen + sizeof(*token); ret = -ENOMEM; token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) goto error; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) goto error_free; token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = v1->ticket_length; token->kad->expiry = v1->expiry; token->kad->kvno = v1->kvno; memcpy(&token->kad->session_key, &v1->session_key, 8); memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ pp = (struct rxrpc_key_token **)&prep->payload.data[0]; while (*pp) pp = &(*pp)->next; *pp = token; expiry = rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; token = NULL; ret = 0; error_free: kfree(token); error: return ret; } /* * Free token list. */ static void rxrpc_free_token_list(struct rxrpc_key_token *token) { struct rxrpc_key_token *next; for (; token; token = next) { next = token->next; switch (token->security_index) { case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; case RXRPC_SECURITY_YFS_RXGK: kfree(token->rxgk->ticket.data); kfree(token->rxgk); break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); BUG(); } kfree(token); } } /* * Clean up preparse data. 
*/ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) { rxrpc_free_token_list(prep->payload.data[0]); } /* * dispose of the data dangling from the corpse of a rxrpc key */ static void rxrpc_destroy(struct key *key) { rxrpc_free_token_list(key->payload.data[0]); } /* * describe the rxrpc key */ static void rxrpc_describe(const struct key *key, struct seq_file *m) { const struct rxrpc_key_token *token; const char *sep = ": "; seq_puts(m, key->description); for (token = key->payload.data[0]; token; token = token->next) { seq_puts(m, sep); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; case RXRPC_SECURITY_YFS_RXGK: seq_puts(m, "ygk"); break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; } sep = " "; } } /* * grab the security key for a socket */ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->securities) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->key = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /* * generate a server data key */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, const void *session_key, time64_t expiry, u32 kvno) { const struct cred *cred = current_cred(); struct key *key; int ret; struct { u32 kver; struct rxrpc_key_data_v1 v1; } data; _enter(""); key = key_alloc(&key_type_rxrpc, "x", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); return -ENOMEM; } _debug("key %d", key_serial(key)); data.kver = 1; data.v1.security_index = RXRPC_SECURITY_RXKAD; data.v1.ticket_length = 0; data.v1.expiry = rxrpc_time64_to_u32(expiry); data.v1.kvno = 0; memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); if (ret < 0) goto error; conn->key = key; _leave(" = 0 [%d]", key_serial(key)); return 0; error: key_revoke(key); key_put(key); _leave(" = -ENOMEM [ins %d]", ret); return -ENOMEM; } EXPORT_SYMBOL(rxrpc_get_server_data_key); /** * rxrpc_get_null_key - Generate a null RxRPC key * @keyname: The name to give the key. * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. * * Return: The new key or a negative error code. 
*/ struct key *rxrpc_get_null_key(const char *keyname) { const struct cred *cred = current_cred(); struct key *key; int ret; key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); if (ret < 0) { key_revoke(key); key_put(key); return ERR_PTR(ret); } return key; } EXPORT_SYMBOL(rxrpc_get_null_key); /* * read the contents of an rxrpc key * - this returns the result in XDR form */ static long rxrpc_read(const struct key *key, char *buffer, size_t buflen) { const struct rxrpc_key_token *token; size_t size; __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; _enter(""); /* we don't know what form we should return non-AFS keys in */ if (memcmp(key->description, "afs@", 4) != 0) return -EOPNOTSUPP; cnlen = strlen(key->description + 4); #define RND(X) (((X) + 3) & ~3) /* AFS keys we return in XDR form, so we need to work out the size of * the XDR */ size = 2 * 4; /* flags, cellname len */ size += RND(cnlen); /* cellname */ size += 1 * 4; /* token count */ ntoks = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = 4; /* sec index */ switch (token->security_index) { case RXRPC_SECURITY_RXKAD: toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ if (!token->no_leak_key) toksize += RND(token->kad->ticket_len); break; case RXRPC_SECURITY_YFS_RXGK: toksize += 6 * 8 + 2 * 4; if (!token->no_leak_key) toksize += RND(token->rxgk->key.len); toksize += RND(token->rxgk->ticket.len); break; default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); if (WARN_ON(toksize > AFSTOKEN_LENGTH_MAX)) return -EIO; toksizes[ntoks++] = toksize; size += toksize + 4; /* each token has a length word */ } #undef RND if (!buffer || buflen < size) return size; xdr = (__be32 *)buffer; zero = 0; #define ENCODE(x) \ do { \ *xdr++ = htonl(x); \ } while(0) #define ENCODE_DATA(l, s) \ do { \ u32 _l = (l); \ ENCODE(l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE_BYTES(l, s) \ do { \ u32 _l = (l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ memcpy(xdr, &y, 8); \ xdr += 8 >> 2; \ } while(0) #define ENCODE_STR(s) \ do { \ const char *_s = (s); \ ENCODE_DATA(strlen(_s), _s); \ } while(0) ENCODE(0); /* flags */ ENCODE_DATA(cnlen, key->description + 4); /* cellname */ ENCODE(ntoks); tok = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = toksizes[tok++]; ENCODE(toksize); oldxdr = xdr; ENCODE(token->security_index); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: ENCODE(token->kad->vice_id); ENCODE(token->kad->kvno); ENCODE_BYTES(8, token->kad->session_key); ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); if (token->no_leak_key) ENCODE(0); else ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; case RXRPC_SECURITY_YFS_RXGK: ENCODE64(token->rxgk->begintime); ENCODE64(token->rxgk->endtime); ENCODE64(token->rxgk->level); ENCODE64(token->rxgk->lifetime); ENCODE64(token->rxgk->bytelife); ENCODE64(token->rxgk->enctype); if (token->no_leak_key) ENCODE(0); else 
ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); break; default: pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr != toksize)) return -EIO; } #undef ENCODE_STR #undef ENCODE_DATA #undef ENCODE64 #undef ENCODE if (WARN_ON(tok != ntoks)) return -EIO; if (WARN_ON((unsigned long)xdr - (unsigned long)buffer != size)) return -EIO; _leave(" = %zu", size); return size; }
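/*
 * Illustrative sketch, not part of rxrpc: the XDR padding rule used
 * throughout this file. XDR opaque fields are length-prefixed and padded
 * out to a 4-byte boundary, which is why rxrpc_preparse_xdr() computes
 * paddedlen as (len + 3) & ~3 and rxrpc_read() sizes and emits tokens with
 * the equivalent RND() and ENCODE_DATA() helpers.
 */
static inline unsigned int example_xdr_padded_len(unsigned int len)
{
	return (len + 3) & ~3;	/* e.g. 0 -> 0, 5 -> 8, 8 -> 8 */
}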
// SPDX-License-Identifier: GPL-2.0-or-later /* * Joystick device driver for the input driver suite. * * Copyright (c) 1999-2002 Vojtech Pavlik * Copyright (c) 1999 Colin Van Dyke */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/io.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/joystick.h> #include <linux/input.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/device.h> #include <linux/cdev.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Joystick device interfaces"); MODULE_LICENSE("GPL"); #define JOYDEV_MINOR_BASE 0 #define JOYDEV_MINORS 16 #define JOYDEV_BUFFER_SIZE 64 struct joydev { int open; struct input_handle handle; wait_queue_head_t wait; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; struct js_corr corr[ABS_CNT]; struct JS_DATA_SAVE_TYPE glue; int nabs; int nkey; __u16 keymap[KEY_MAX - BTN_MISC + 1]; __u16 keypam[KEY_MAX - BTN_MISC + 1]; __u8 absmap[ABS_CNT]; __u8 abspam[ABS_CNT]; __s16 abs[ABS_CNT]; }; struct joydev_client { struct js_event buffer[JOYDEV_BUFFER_SIZE]; int head; int tail; int startup; spinlock_t buffer_lock; /* protects access to buffer, head and tail */ struct fasync_struct *fasync; struct joydev *joydev; struct list_head node; }; static int joydev_correct(int value, struct js_corr *corr) { switch (corr->type) { case JS_CORR_NONE: break; case JS_CORR_BROKEN: value = value > corr->coef[0] ? (value < corr->coef[1] ?
0 : ((corr->coef[3] * (value - corr->coef[1])) >> 14)) : ((corr->coef[2] * (value - corr->coef[0])) >> 14); break; default: return 0; } return clamp(value, -32767, 32767); } static void joydev_pass_event(struct joydev_client *client, struct js_event *event) { struct joydev *joydev = client->joydev; /* * IRQs already disabled, just acquire the lock */ spin_lock(&client->buffer_lock); client->buffer[client->head] = *event; if (client->startup == joydev->nabs + joydev->nkey) { client->head++; client->head &= JOYDEV_BUFFER_SIZE - 1; if (client->tail == client->head) client->startup = 0; } spin_unlock(&client->buffer_lock); kill_fasync(&client->fasync, SIGIO, POLL_IN); } static void joydev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct joydev *joydev = handle->private; struct joydev_client *client; struct js_event event; switch (type) { case EV_KEY: if (code < BTN_MISC || value == 2) return; event.type = JS_EVENT_BUTTON; event.number = joydev->keymap[code - BTN_MISC]; event.value = value; break; case EV_ABS: event.type = JS_EVENT_AXIS; event.number = joydev->absmap[code]; event.value = joydev_correct(value, &joydev->corr[event.number]); if (event.value == joydev->abs[event.number]) return; joydev->abs[event.number] = event.value; break; default: return; } event.time = jiffies_to_msecs(jiffies); rcu_read_lock(); list_for_each_entry_rcu(client, &joydev->client_list, node) joydev_pass_event(client, &event); rcu_read_unlock(); wake_up_interruptible(&joydev->wait); } static int joydev_fasync(int fd, struct file *file, int on) { struct joydev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void joydev_free(struct device *dev) { struct joydev *joydev = container_of(dev, struct joydev, dev); input_put_device(joydev->handle.dev); kfree(joydev); } static void joydev_attach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_add_tail_rcu(&client->node, &joydev->client_list); spin_unlock(&joydev->client_lock); } static void joydev_detach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_del_rcu(&client->node); spin_unlock(&joydev->client_lock); synchronize_rcu(); } static void joydev_refresh_state(struct joydev *joydev) { struct input_dev *dev = joydev->handle.dev; int i, val; for (i = 0; i < joydev->nabs; i++) { val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } } static int joydev_open_device(struct joydev *joydev) { int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) retval = -ENODEV; else if (!joydev->open++) { retval = input_open_device(&joydev->handle); if (retval) joydev->open--; else joydev_refresh_state(joydev); } mutex_unlock(&joydev->mutex); return retval; } static void joydev_close_device(struct joydev *joydev) { mutex_lock(&joydev->mutex); if (joydev->exist && !--joydev->open) input_close_device(&joydev->handle); mutex_unlock(&joydev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void joydev_hangup(struct joydev *joydev) { struct joydev_client *client; spin_lock(&joydev->client_lock); list_for_each_entry(client, &joydev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&joydev->client_lock); wake_up_interruptible(&joydev->wait); } static int joydev_release(struct inode *inode, struct file *file) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; joydev_detach_client(joydev, client); kfree(client); joydev_close_device(joydev); return 0; } static int joydev_open(struct inode *inode, struct file *file) { struct joydev *joydev = container_of(inode->i_cdev, struct joydev, cdev); struct joydev_client *client; int error; client = kzalloc(sizeof(struct joydev_client), GFP_KERNEL); if (!client) return -ENOMEM; spin_lock_init(&client->buffer_lock); client->joydev = joydev; joydev_attach_client(joydev, client); error = joydev_open_device(joydev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: joydev_detach_client(joydev, client); kfree(client); return error; } static int joydev_generate_startup_event(struct joydev_client *client, struct input_dev *input, struct js_event *event) { struct joydev *joydev = client->joydev; int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->startup < joydev->nabs + joydev->nkey; if (have_event) { event->time = jiffies_to_msecs(jiffies); if (client->startup < joydev->nkey) { event->type = JS_EVENT_BUTTON | JS_EVENT_INIT; event->number = client->startup; event->value = !!test_bit(joydev->keypam[event->number], input->key); } else { event->type = JS_EVENT_AXIS | JS_EVENT_INIT; event->number = client->startup - joydev->nkey; event->value = joydev->abs[event->number]; } client->startup++; } spin_unlock_irq(&client->buffer_lock); return have_event; } static int joydev_fetch_next_event(struct joydev_client *client, struct js_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= JOYDEV_BUFFER_SIZE - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } /* * Old joystick interface */ static ssize_t joydev_0x_read(struct joydev_client *client, struct input_dev *input, char __user *buf) { struct joydev *joydev = client->joydev; struct JS_DATA_TYPE data; int i; spin_lock_irq(&input->event_lock); /* * Get device state */ for (data.buttons = i = 0; i < 32 && i < joydev->nkey; i++) data.buttons |= test_bit(joydev->keypam[i], input->key) ? 
(1 << i) : 0; data.x = (joydev->abs[0] / 256 + 128) >> joydev->glue.JS_CORR.x; data.y = (joydev->abs[1] / 256 + 128) >> joydev->glue.JS_CORR.y; /* * Reset reader's event queue */ spin_lock(&client->buffer_lock); client->startup = 0; client->tail = client->head; spin_unlock(&client->buffer_lock); spin_unlock_irq(&input->event_lock); if (copy_to_user(buf, &data, sizeof(struct JS_DATA_TYPE))) return -EFAULT; return sizeof(struct JS_DATA_TYPE); } static inline int joydev_data_pending(struct joydev_client *client) { struct joydev *joydev = client->joydev; return client->startup < joydev->nabs + joydev->nkey || client->head != client->tail; } static ssize_t joydev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; struct input_dev *input = joydev->handle.dev; struct js_event event; int retval; if (!joydev->exist) return -ENODEV; if (count < sizeof(struct js_event)) return -EINVAL; if (count == sizeof(struct JS_DATA_TYPE)) return joydev_0x_read(client, input, buf); if (!joydev_data_pending(client) && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(joydev->wait, !joydev->exist || joydev_data_pending(client)); if (retval) return retval; if (!joydev->exist) return -ENODEV; while (retval + sizeof(struct js_event) <= count && joydev_generate_startup_event(client, input, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } while (retval + sizeof(struct js_event) <= count && joydev_fetch_next_event(client, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } return retval; } /* No kernel lock - fine */ static __poll_t joydev_poll(struct file *file, poll_table *wait) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; poll_wait(file, &joydev->wait, wait); return (joydev_data_pending(client) ? (EPOLLIN | EPOLLRDNORM) : 0) | (joydev->exist ? 0 : (EPOLLHUP | EPOLLERR)); } static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev, void __user *argp, size_t len) { __u8 *abspam; int i; int retval = 0; len = min(len, sizeof(joydev->abspam)); /* Validate the map. */ abspam = memdup_user(argp, len); if (IS_ERR(abspam)) return PTR_ERR(abspam); for (i = 0; i < len && i < joydev->nabs; i++) { if (abspam[i] > ABS_MAX) { retval = -EINVAL; goto out; } } memcpy(joydev->abspam, abspam, len); for (i = 0; i < joydev->nabs; i++) joydev->absmap[joydev->abspam[i]] = i; out: kfree(abspam); return retval; } static int joydev_handle_JSIOCSBTNMAP(struct joydev *joydev, void __user *argp, size_t len) { __u16 *keypam; int i; int retval = 0; if (len % sizeof(*keypam)) return -EINVAL; len = min(len, sizeof(joydev->keypam)); /* Validate the map. */ keypam = memdup_user(argp, len); if (IS_ERR(keypam)) return PTR_ERR(keypam); for (i = 0; i < (len / 2) && i < joydev->nkey; i++) { if (keypam[i] > KEY_MAX || keypam[i] < BTN_MISC) { retval = -EINVAL; goto out; } } memcpy(joydev->keypam, keypam, len); for (i = 0; i < joydev->nkey; i++) joydev->keymap[joydev->keypam[i] - BTN_MISC] = i; out: kfree(keypam); return retval; } static int joydev_ioctl_common(struct joydev *joydev, unsigned int cmd, void __user *argp) { struct input_dev *dev = joydev->handle.dev; size_t len; int i; const char *name; /* Process fixed-sized commands. 
*/ switch (cmd) { case JS_SET_CAL: return copy_from_user(&joydev->glue.JS_CORR, argp, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_GET_CAL: return copy_to_user(argp, &joydev->glue.JS_CORR, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_SET_TIMEOUT: return get_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JS_GET_TIMEOUT: return put_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JSIOCGVERSION: return put_user(JS_VERSION, (__u32 __user *) argp); case JSIOCGAXES: return put_user(joydev->nabs, (__u8 __user *) argp); case JSIOCGBUTTONS: return put_user(joydev->nkey, (__u8 __user *) argp); case JSIOCSCORR: if (copy_from_user(joydev->corr, argp, sizeof(joydev->corr[0]) * joydev->nabs)) return -EFAULT; for (i = 0; i < joydev->nabs; i++) { int val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } return 0; case JSIOCGCORR: return copy_to_user(argp, joydev->corr, sizeof(joydev->corr[0]) * joydev->nabs) ? -EFAULT : 0; } /* * Process variable-sized commands (the axis and button map commands * are considered variable-sized to decouple them from the values of * ABS_MAX and KEY_MAX). */ switch (cmd & ~IOCSIZE_MASK) { case (JSIOCSAXMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSAXMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGAXMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->abspam)); return copy_to_user(argp, joydev->abspam, len) ? -EFAULT : len; case (JSIOCSBTNMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSBTNMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGBTNMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->keypam)); return copy_to_user(argp, joydev->keypam, len) ? -EFAULT : len; case JSIOCGNAME(0): name = dev->name; if (!name) return 0; len = min_t(size_t, _IOC_SIZE(cmd), strlen(name) + 1); return copy_to_user(argp, name, len) ? -EFAULT : len; } return -EINVAL; } #ifdef CONFIG_COMPAT static long joydev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; s32 tmp32; struct JS_DATA_SAVE_TYPE_32 ds32; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(tmp32, (s32 __user *) arg); if (retval == 0) joydev->glue.JS_TIMELIMIT = tmp32; break; case JS_GET_TIMELIMIT: tmp32 = joydev->glue.JS_TIMELIMIT; retval = put_user(tmp32, (s32 __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&ds32, argp, sizeof(ds32)) ? -EFAULT : 0; if (retval == 0) { joydev->glue.JS_TIMEOUT = ds32.JS_TIMEOUT; joydev->glue.BUSY = ds32.BUSY; joydev->glue.JS_EXPIRETIME = ds32.JS_EXPIRETIME; joydev->glue.JS_TIMELIMIT = ds32.JS_TIMELIMIT; joydev->glue.JS_SAVE = ds32.JS_SAVE; joydev->glue.JS_CORR = ds32.JS_CORR; } break; case JS_GET_ALL: ds32.JS_TIMEOUT = joydev->glue.JS_TIMEOUT; ds32.BUSY = joydev->glue.BUSY; ds32.JS_EXPIRETIME = joydev->glue.JS_EXPIRETIME; ds32.JS_TIMELIMIT = joydev->glue.JS_TIMELIMIT; ds32.JS_SAVE = joydev->glue.JS_SAVE; ds32.JS_CORR = joydev->glue.JS_CORR; retval = copy_to_user(argp, &ds32, sizeof(ds32)) ? 
-EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } #endif /* CONFIG_COMPAT */ static long joydev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_GET_TIMELIMIT: retval = put_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&joydev->glue, argp, sizeof(joydev->glue)) ? -EFAULT : 0; break; case JS_GET_ALL: retval = copy_to_user(argp, &joydev->glue, sizeof(joydev->glue)) ? -EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } static const struct file_operations joydev_fops = { .owner = THIS_MODULE, .read = joydev_read, .poll = joydev_poll, .open = joydev_open, .release = joydev_release, .unlocked_ioctl = joydev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = joydev_compat_ioctl, #endif .fasync = joydev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void joydev_mark_dead(struct joydev *joydev) { mutex_lock(&joydev->mutex); joydev->exist = false; mutex_unlock(&joydev->mutex); } static void joydev_cleanup(struct joydev *joydev) { struct input_handle *handle = &joydev->handle; joydev_mark_dead(joydev); joydev_hangup(joydev); /* joydev is marked dead so no one else accesses joydev->open */ if (joydev->open) input_close_device(handle); } /* * These codes are copied from hid-ids.h, unfortunately there is no common * usb_ids/bt_ids.h header. 
*/ #define USB_VENDOR_ID_SONY 0x054c #define USB_DEVICE_ID_SONY_PS3_CONTROLLER 0x0268 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER 0x05c4 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE 0x0ba0 #define USB_VENDOR_ID_THQ 0x20d6 #define USB_DEVICE_ID_THQ_PS3_UDRAW 0xcb17 #define USB_VENDOR_ID_NINTENDO 0x057e #define USB_DEVICE_ID_NINTENDO_JOYCONL 0x2006 #define USB_DEVICE_ID_NINTENDO_JOYCONR 0x2007 #define USB_DEVICE_ID_NINTENDO_PROCON 0x2009 #define USB_DEVICE_ID_NINTENDO_CHRGGRIP 0x200E #define ACCEL_DEV(vnd, prd) \ { \ .flags = INPUT_DEVICE_ID_MATCH_VENDOR | \ INPUT_DEVICE_ID_MATCH_PRODUCT | \ INPUT_DEVICE_ID_MATCH_PROPBIT, \ .vendor = (vnd), \ .product = (prd), \ .propbit = { BIT_MASK(INPUT_PROP_ACCELEROMETER) }, \ } static const struct input_device_id joydev_blacklist[] = { /* Avoid touchpads and touchscreens */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, }, /* Avoid tablets, digitisers and similar devices */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_DIGI) }, }, /* Disable accelerometers on composite devices */ ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE), ACCEL_DEV(USB_VENDOR_ID_THQ, USB_DEVICE_ID_THQ_PS3_UDRAW), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_PROCON), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_CHRGGRIP), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONL), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONR), { /* sentinel */ } }; static bool joydev_dev_is_blacklisted(struct input_dev *dev) { const struct input_device_id *id; for (id = joydev_blacklist; id->flags; id++) { if (input_match_device_id(dev, id)) { dev_dbg(&dev->dev, "joydev: blacklisting '%s'\n", dev->name); return true; } } return false; } static bool joydev_dev_is_absolute_mouse(struct input_dev *dev) { DECLARE_BITMAP(jd_scratch, KEY_CNT); bool ev_match = false; BUILD_BUG_ON(ABS_CNT > KEY_CNT || EV_CNT > KEY_CNT); /* * Virtualization (VMware, etc) and remote management (HP * ILO2) solutions use absolute coordinates for their virtual * pointing devices so that there is one-to-one relationship * between pointer position on the host screen and virtual * guest screen, and so their mice use ABS_X, ABS_Y and 3 * primary button events. This clashes with what joydev * considers to be joysticks (a device with at minimum ABS_X * axis). * * Here we are trying to separate absolute mice from * joysticks. A device is, for joystick detection purposes, * considered to be an absolute mouse if the following is * true: * * 1) Event types are exactly * EV_ABS, EV_KEY and EV_SYN * or * EV_ABS, EV_KEY, EV_SYN and EV_MSC * or * EV_ABS, EV_KEY, EV_SYN, EV_MSC and EV_REL. * 2) Absolute events are exactly ABS_X and ABS_Y. * 3) Keys are exactly BTN_LEFT, BTN_RIGHT and BTN_MIDDLE. * 4) Device is not on "Amiga" bus. 
*/ bitmap_zero(jd_scratch, EV_CNT); /* VMware VMMouse, HP ILO2 */ __set_bit(EV_ABS, jd_scratch); __set_bit(EV_KEY, jd_scratch); __set_bit(EV_SYN, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* HP ILO2, AMI BMC firmware */ __set_bit(EV_MSC, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* VMware Virtual USB Mouse, QEMU USB Tablet, ATEN BMC firmware */ __set_bit(EV_REL, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; if (!ev_match) return false; bitmap_zero(jd_scratch, ABS_CNT); __set_bit(ABS_X, jd_scratch); __set_bit(ABS_Y, jd_scratch); if (!bitmap_equal(dev->absbit, jd_scratch, ABS_CNT)) return false; bitmap_zero(jd_scratch, KEY_CNT); __set_bit(BTN_LEFT, jd_scratch); __set_bit(BTN_RIGHT, jd_scratch); __set_bit(BTN_MIDDLE, jd_scratch); if (!bitmap_equal(dev->keybit, jd_scratch, KEY_CNT)) return false; /* * Amiga joystick (amijoy) historically uses left/middle/right * button events. */ if (dev->id.bustype == BUS_AMIGA) return false; return true; } static bool joydev_match(struct input_handler *handler, struct input_dev *dev) { /* Disable blacklisted devices */ if (joydev_dev_is_blacklisted(dev)) return false; /* Avoid absolute mice */ if (joydev_dev_is_absolute_mouse(dev)) return false; return true; } static int joydev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct joydev *joydev; int i, j, t, minor, dev_no; int error; minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } joydev = kzalloc(sizeof(struct joydev), GFP_KERNEL); if (!joydev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&joydev->client_list); spin_lock_init(&joydev->client_lock); mutex_init(&joydev->mutex); init_waitqueue_head(&joydev->wait); joydev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < JOYDEV_MINOR_BASE + JOYDEV_MINORS) dev_no -= JOYDEV_MINOR_BASE; dev_set_name(&joydev->dev, "js%d", dev_no); joydev->handle.dev = input_get_device(dev); joydev->handle.name = dev_name(&joydev->dev); joydev->handle.handler = handler; joydev->handle.private = joydev; for_each_set_bit(i, dev->absbit, ABS_CNT) { joydev->absmap[i] = joydev->nabs; joydev->abspam[joydev->nabs] = i; joydev->nabs++; } for (i = BTN_JOYSTICK - BTN_MISC; i < KEY_MAX - BTN_MISC + 1; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < joydev->nabs; i++) { j = joydev->abspam[i]; if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) { joydev->corr[i].type = JS_CORR_NONE; continue; } joydev->corr[i].type = JS_CORR_BROKEN; joydev->corr[i].prec = input_abs_get_fuzz(dev, j); t = (input_abs_get_max(dev, j) + input_abs_get_min(dev, j)) / 2; joydev->corr[i].coef[0] = t - input_abs_get_flat(dev, j); joydev->corr[i].coef[1] = t + input_abs_get_flat(dev, j); t = (input_abs_get_max(dev, j) - input_abs_get_min(dev, j)) / 2 - 2 * input_abs_get_flat(dev, j); if (t) { joydev->corr[i].coef[2] = (1 << 29) / t; joydev->corr[i].coef[3] = (1 << 29) / t; } } joydev->dev.devt = MKDEV(INPUT_MAJOR, minor); joydev->dev.class = &input_class; joydev->dev.parent = 
&dev->dev; joydev->dev.release = joydev_free; device_initialize(&joydev->dev); error = input_register_handle(&joydev->handle); if (error) goto err_free_joydev; cdev_init(&joydev->cdev, &joydev_fops); error = cdev_device_add(&joydev->cdev, &joydev->dev); if (error) goto err_cleanup_joydev; return 0; err_cleanup_joydev: joydev_cleanup(joydev); input_unregister_handle(&joydev->handle); err_free_joydev: put_device(&joydev->dev); err_free_minor: input_free_minor(minor); return error; } static void joydev_disconnect(struct input_handle *handle) { struct joydev *joydev = handle->private; cdev_device_del(&joydev->cdev, &joydev->dev); joydev_cleanup(joydev); input_free_minor(MINOR(joydev->dev.devt)); input_unregister_handle(handle); put_device(&joydev->dev); } static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_X) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_Z) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_WHEEL) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_THROTTLE) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = {[BIT_WORD(BTN_JOYSTICK)] = BIT_MASK(BTN_JOYSTICK) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_GAMEPAD)] = BIT_MASK(BTN_GAMEPAD) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TRIGGER_HAPPY)] = BIT_MASK(BTN_TRIGGER_HAPPY) }, }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, joydev_ids); static struct input_handler joydev_handler = { .event = joydev_event, .match = joydev_match, .connect = joydev_connect, .disconnect = joydev_disconnect, .legacy_minors = true, .minor = JOYDEV_MINOR_BASE, .name = "joydev", .id_table = joydev_ids, }; static int __init joydev_init(void) { return input_register_handler(&joydev_handler); } static void __exit joydev_exit(void) { input_unregister_handler(&joydev_handler); } module_init(joydev_init); module_exit(joydev_exit);
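/*
 * Illustrative sketch, not part of joydev: the JS_CORR_BROKEN arithmetic
 * performed by joydev_correct() above. Values inside the flat (dead) zone
 * [coef[0], coef[1]] collapse to zero; values outside it are scaled by
 * coef[2] or coef[3], which joydev_connect() sets to (1 << 29) / t so that
 * full deflection lands near +/-32768 after the >> 14 shift before being
 * clamped to the +/-32767 output range.
 */
static inline int example_js_corr_broken(int value, const __s32 coef[4])
{
	int v;

	if (value > coef[0]) {
		if (value < coef[1])
			return 0;	/* inside the dead zone */
		v = (coef[3] * (value - coef[1])) >> 14;
	} else {
		v = (coef[2] * (value - coef[0])) >> 14;
	}
	return clamp(v, -32767, 32767);
}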
// SPDX-License-Identifier: GPL-2.0-only /* Object lifetime handling and tracing. * * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/mempool.h> #include <linux/delay.h> #include "internal.h" static void netfs_free_request(struct work_struct *work); /* * Allocate an I/O request and initialise it. */ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct file *file, loff_t start, size_t len, enum netfs_io_origin origin) { static atomic_t debug_ids; struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; int ret; for (;;) { rreq = mempool_alloc(mempool, GFP_KERNEL); if (rreq) break; msleep(10); } memset(rreq, 0, kmem_cache_size(cache)); INIT_WORK(&rreq->cleanup_work, netfs_free_request); rreq->start = start; rreq->len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; rreq->inode = inode; rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); rreq->wsize = INT_MAX; rreq->io_streams[0].sreq_max_len = ULONG_MAX; rreq->io_streams[0].sreq_max_segs = 0; spin_lock_init(&rreq->lock); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); init_waitqueue_head(&rreq->waitq); refcount_set(&rreq->ref, 2); if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || origin == NETFS_UNBUFFERED_READ || origin == NETFS_DIO_READ) { INIT_WORK(&rreq->work, netfs_read_collection_worker); rreq->io_streams[0].avail = true; } else { INIT_WORK(&rreq->work, netfs_write_collection_worker); } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); return ERR_PTR(ret); } } atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; } void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) { int r; __refcount_inc(&rreq->ref, &r); trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); } void
netfs_clear_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; int s; for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { stream = &rreq->io_streams[s]; while (!list_empty(&stream->subrequests)) { subreq = list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear); } } } static void netfs_free_request_rcu(struct rcu_head *rcu) { struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu); mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool); netfs_stat_d(&netfs_n_rh_rreq); } static void netfs_deinit_request(struct netfs_io_request *rreq) { struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); /* Cancel/flush the result collection worker. That does not carry a * ref of its own, so we must wait for it somewhere. */ cancel_work_sync(&rreq->work); netfs_proc_del_rreq(rreq); netfs_clear_subrequests(rreq); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); if (rreq->direct_bv) { for (i = 0; i < rreq->direct_bv_count; i++) { if (rreq->direct_bv[i].bv_page) { if (rreq->direct_bv_unpin) unpin_user_page(rreq->direct_bv[i].bv_page); } } kvfree(rreq->direct_bv); } rolling_buffer_clear(&rreq->buffer); if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); } static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, cleanup_work); netfs_deinit_request(rreq); call_rcu(&rreq->rcu, netfs_free_request_rcu); } void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) { unsigned int debug_id; bool dead; int r; if (rreq) { debug_id = rreq->debug_id; dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); if (dead) WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work)); } } /* * Free a request (synchronously) that was just allocated but has * failed before it could be submitted. */ void netfs_put_failed_request(struct netfs_io_request *rreq) { int r = refcount_read(&rreq->ref); /* New requests have two references (see netfs_alloc_request()), and * this function is only allowed on new request objects. */ WARN_ON_ONCE(r != 2); trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed); netfs_free_request(&rreq->cleanup_work); } /* * Allocate and partially initialise an I/O subrequest structure.
*/ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool; struct kmem_cache *cache = mempool->pool_data; for (;;) { subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool, GFP_KERNEL); if (subreq) break; msleep(10); } memset(subreq, 0, kmem_cache_size(cache)); INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); netfs_get_request(rreq, netfs_rreq_trace_get_subreq); netfs_stat(&netfs_n_rh_sreq); return subreq; } void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what) { int r; __refcount_inc(&subreq->ref, &r); trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, r + 1, what); } static void netfs_free_subrequest(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; trace_netfs_sreq(subreq, netfs_sreq_trace_free); if (rreq->netfs_ops->free_subrequest) rreq->netfs_ops->free_subrequest(subreq); mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, netfs_rreq_trace_put_subreq); } void netfs_put_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what) { unsigned int debug_index = subreq->debug_index; unsigned int debug_id = subreq->rreq->debug_id; bool dead; int r; dead = __refcount_dec_and_test(&subreq->ref, &r); trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); if (dead) netfs_free_subrequest(subreq); }
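/*
 * Illustrative sketch (hypothetical "myfs" names, not from this file): how a
 * filesystem supplies the hooks that netfs_alloc_request() consults above.
 * Only fields this file actually uses are shown; a private request_pool is
 * optional, and leaving it NULL falls back to the shared netfs_request_pool.
 */
static mempool_t myfs_request_pool;	/* assumed initialised at module init */

static int myfs_init_request(struct netfs_io_request *rreq, struct file *file)
{
	/* Per-request setup; a negative return makes netfs_alloc_request()
	 * free the object and propagate the error via ERR_PTR(). */
	return 0;
}

static void myfs_free_request(struct netfs_io_request *rreq)
{
	/* Undo whatever myfs_init_request() set up. */
}

static const struct netfs_request_ops myfs_req_ops = {
	.request_pool	= &myfs_request_pool,
	.init_request	= myfs_init_request,
	.free_request	= myfs_free_request,
};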
// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- * sysctl_net.c: sysctl interface to net subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net directories for each protocol family. [MS] * * Revision 1.2 1996/05/08 20:24:40 shaver * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and * NET_IPV4_IP_FORWARD. * * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/sysctl.h> #include <linux/nsproxy.h> #include <net/sock.h> #ifdef CONFIG_INET #include <net/ip.h> #endif #ifdef CONFIG_NET #include <linux/if_ether.h> #endif static struct ctl_table_set * net_ctl_header_lookup(struct ctl_table_root *root) { return &current->nsproxy->net_ns->sysctls; } static int is_seen(struct ctl_table_set *set) { return &current->nsproxy->net_ns->sysctls == set; } /* Return standard mode bits for table entry. */ static int net_ctl_permissions(struct ctl_table_header *head, const struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); /* Allow network administrator to have same access as root. */ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } return table->mode; } static void net_ctl_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct net *net = container_of(head->set, struct net, sysctls); kuid_t ns_root_uid; kgid_t ns_root_gid; ns_root_uid = make_kuid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; ns_root_gid = make_kgid(net->user_ns, 0); if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) { setup_sysctl_set(&net->sysctls, &net_sysctl_root, is_seen); return 0; } static void __net_exit sysctl_net_exit(struct net *net) { retire_sysctl_set(&net->sysctls); } static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, }; static struct ctl_table_header *net_header; __init int net_sysctl_init(void) { static struct ctl_table empty[1]; int ret = -ENOMEM; /* Avoid limitations in the sysctl implementation by * registering "/proc/sys/net" as an empty directory not in a * network namespace. */ net_header = register_sysctl_sz("net", empty, 0); if (!net_header) goto out; ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; out: return ret; out1: unregister_sysctl_table(net_header); net_header = NULL; goto out; } /* Verify that sysctls for non-init netns are safe by either: * 1) being read-only, or * 2) having a data pointer which points outside of the global kernel/module * data segment, and rather into the heap where a per-net object was * allocated.
*/ static void ensure_safe_net_sysctl(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { struct ctl_table *ent; pr_debug("Registering net sysctl (net %p): %s\n", net, path); ent = table; for (size_t i = 0; i < table_size; ent++, i++) { unsigned long addr; const char *where; pr_debug(" procname=%s mode=%o proc_handler=%ps data=%p\n", ent->procname, ent->mode, ent->proc_handler, ent->data); /* If it's not writable inside the netns, then it can't hurt. */ if ((ent->mode & 0222) == 0) { pr_debug(" Not writable by anyone\n"); continue; } /* Where does data point? */ addr = (unsigned long)ent->data; if (is_module_address(addr)) where = "module"; else if (is_kernel_core_data(addr)) where = "kernel"; else continue; /* If it is writable and points to kernel/module global * data, then it's probably a netns leak. */ WARN(1, "sysctl %s/%s: data points to %s global data: %ps\n", path, ent->procname, where, ent->data); /* Make it "safe" by dropping writable perms */ ent->mode &= ~0222; } } struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { if (!net_eq(net, &init_net)) ensure_safe_net_sysctl(net, path, table, table_size); return __register_sysctl_table(&net->sysctls, path, table, table_size); } EXPORT_SYMBOL_GPL(register_net_sysctl_sz); void unregister_net_sysctl_table(struct ctl_table_header *header) { unregister_sysctl_table(header); } EXPORT_SYMBOL_GPL(unregister_net_sysctl_table);
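/*
 * Illustrative sketch (hypothetical "example" names): a per-netns sysctl
 * registration that satisfies ensure_safe_net_sysctl() above. The table is
 * duplicated per namespace and its writable entry is pointed at per-net
 * storage, so nothing writable aliases global kernel/module data.
 */
static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __net_init example_net_init(struct net *net)
{
	struct ctl_table *tbl;
	struct ctl_table_header *hdr;

	tbl = kmemdup(example_table, sizeof(example_table), GFP_KERNEL);
	if (!tbl)
		return -ENOMEM;
	/* Hypothetical per-net field; the point is that .data must not be
	 * a global when the entry is writable in a non-init netns. */
	tbl[0].data = &net->example_value;
	hdr = register_net_sysctl_sz(net, "net/example", tbl,
				     ARRAY_SIZE(example_table));
	if (!hdr) {
		kfree(tbl);
		return -ENOMEM;
	}
	/* hdr would be stashed per-net and handed to
	 * unregister_net_sysctl_table() in the ->exit hook. */
	return 0;
}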
// SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool udplite_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); #endif
return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in icmp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. */ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be an inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY: case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct
nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDPLITE: return udplite_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. 
*/ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int 
oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff 
*skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. 
*/ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void 
*)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibility to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static
unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ #if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) { int ret; if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) return -EINVAL; ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); if (ret) return ret; ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); #endif /* NFT INET NAT */
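/*
 * Worked example of the arithmetic behind inet_proto_csum_replace2() and
 * friends used throughout this file: per RFC 1624, replacing a 16-bit
 * field m with m' in a one's-complement checksum HC gives
 * HC' = ~(~HC + ~m + m'). Standalone sketch; values are expected in
 * network byte order, as with the port fields above.
 */
#include <stdint.h>

static uint16_t csum16_replace(uint16_t check, uint16_t old16, uint16_t new16)
{
	uint32_t sum = (uint16_t)~check;	/* ~HC */

	sum += (uint16_t)~old16;		/* + ~m */
	sum += new16;				/* + m' */
	while (sum >> 16)			/* fold carries back in (one's-complement add) */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* HC' */
}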
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> static int bpf_mprog_link(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_link *link = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); if (IS_ERR(link)) return PTR_ERR(link); if (type && link->prog->type != type) { bpf_link_put(link); return -EINVAL; } tuple->link = link; tuple->prog = link->prog; return 0; } static int bpf_mprog_prog(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_prog *prog = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (type && prog->type != type) { bpf_prog_put(prog); return -EINVAL; } tuple->link = NULL; tuple->prog = prog; return 0; } static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { bool link = flags & BPF_F_LINK; bool id = flags & BPF_F_ID; memset(tuple, 0, sizeof(*tuple)); if (link) return bpf_mprog_link(tuple, id_or_fd, flags, type); /* If no relevant flag is set and no id_or_fd was passed, then * tuple link/prog is just NULLed. This is the case when before/ * after selects first/last position without passing fd.
*/ if (!id && !id_or_fd) return 0; return bpf_mprog_prog(tuple, id_or_fd, flags, type); } static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) { if (tuple->link) bpf_link_put(tuple->link); else if (tuple->prog) bpf_prog_put(tuple->prog); } /* The bpf_mprog_{replace,delete}() operate on exact idx position with the * one exception that for deletion we support delete from front/back. In * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). * Adjustment to first and last entry is trivial. The bpf_mprog_insert() * we have to deal with the following cases: * * idx + before: * * Insert P4 before P3: idx for old array is 1, idx for new array is 2, * hence we adjust target idx for the new array, so that memmove copies * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting * before P1 would have old idx -1 and new idx 0. * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * * idx + after: * * Insert P4 after P2: idx for old array is 2, idx for new array is 2. * Again, memmove copies P1 and P2 to the new entry, and we insert P4 * into idx 2. Inserting after P3 would have both old/new idx at 4 aka * bpf_mprog_total(entry). * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ */ static int bpf_mprog_replace(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *oprog; bpf_mprog_read(entry, idx, &fp, &cp); oprog = READ_ONCE(fp->prog); bpf_mprog_write(fp, cp, ntuple); if (!ntuple->link) { WARN_ON_ONCE(cp->link); bpf_prog_put(oprog); } *entry_new = entry; return 0; } static int bpf_mprog_insert(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx, u32 flags) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == total) goto insert; else if (flags & BPF_F_BEFORE) idx += 1; bpf_mprog_entry_grow(peer, idx); insert: bpf_mprog_read(peer, idx, &fp, &cp); bpf_mprog_write(fp, cp, ntuple); bpf_mprog_inc(peer); *entry_new = peer; return 0; } static int bpf_mprog_delete(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *dtuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_entry_shrink(peer, idx); bpf_mprog_dec(peer); bpf_mprog_mark_for_release(peer, dtuple); *entry_new = peer; return 0; } /* In bpf_mprog_pos_*() we evaluate the target position for the BPF * program/link that needs to be replaced, inserted or deleted for * each "rule" independently. If all rules agree on that position * or existing element, then enact replacement, addition or deletion. * If this is not the case, then the request cannot be satisfied and * we bail out with an error. */ static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog)) return tuple->link == cp->link ? 
i : -EBUSY; } return -ENOENT; } static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i - 1; } return tuple->prog ? -ENOENT : -1; } static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i + 1; } return tuple->prog ? -ENOENT : bpf_mprog_total(entry); } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, ntuple = { .prog = prog_new, .link = link, }, otuple = { .prog = prog_old, .link = link, }; int ret, idx = -ERANGE, tidx; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (bpf_mprog_exists(entry, prog_new)) return -EEXIST; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags & ~BPF_F_REPLACE, prog_new->type); if (ret) return ret; if (flags & BPF_F_REPLACE) { tidx = bpf_mprog_pos_exact(entry, &otuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_REPLACE) ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); else ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); out: bpf_mprog_tuple_put(&rtuple); return ret; } static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_cp *cp; struct bpf_mprog_fp *fp; struct bpf_prog *prog; struct bpf_link *link; if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_read(entry, idx, &fp, &cp); prog = READ_ONCE(fp->prog); link = cp->link; /* The deletion request can either be without filled tuple in which * case it gets populated here based on idx, or with filled tuple * where the only thing we end up doing is the WARN_ON_ONCE() assert. * If we hit a BPF link at the given index, it must not be removed * from opts path. 
*/ if (link && !tuple->link) return -EBUSY; WARN_ON_ONCE(tuple->prog && tuple->prog != prog); WARN_ON_ONCE(tuple->link && tuple->link != link); tuple->prog = prog; tuple->link = link; return 0; } int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, dtuple = { .prog = prog, .link = link, }; int ret, idx = -ERANGE, tidx; if (flags & BPF_F_REPLACE) return -EINVAL; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (!bpf_mprog_total(entry)) return -ENOENT; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, prog ? prog->type : BPF_PROG_TYPE_UNSPEC); if (ret) return ret; if (dtuple.prog) { tidx = bpf_mprog_pos_exact(entry, &dtuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } ret = bpf_mprog_fetch(entry, &dtuple, idx); if (ret) goto out; ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); out: bpf_mprog_tuple_put(&rtuple); return ret; } int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry) { u32 __user *uprog_flags, *ulink_flags; u32 __user *uprog_id, *ulink_id; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; u32 id, count = 0; u64 revision = 1; int i, ret = 0; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; if (entry) { revision = bpf_mprog_revision(entry); count = bpf_mprog_total(entry); } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (copy_to_user(&uattr->query.count, &count, sizeof(count))) return -EFAULT; uprog_id = u64_to_user_ptr(attr->query.prog_ids); uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); ulink_id = u64_to_user_ptr(attr->query.link_ids); ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); if (attr->query.count == 0 || !uprog_id || !count) return 0; if (attr->query.count < count) { count = attr->query.count; ret = -ENOSPC; } for (i = 0; i < bpf_mprog_max(); i++) { bpf_mprog_read(entry, i, &fp, &cp); prog = READ_ONCE(fp->prog); if (!prog) break; id = prog->aux->id; if (copy_to_user(uprog_id + i, &id, sizeof(id))) return -EFAULT; if (uprog_flags && copy_to_user(uprog_flags + i, &flags, sizeof(flags))) return -EFAULT; id = cp->link ? cp->link->id : 0; if (ulink_id && copy_to_user(ulink_id + i, &id, sizeof(id))) return -EFAULT; if (ulink_flags && copy_to_user(ulink_flags + i, &flags, sizeof(flags))) return -EFAULT; if (i + 1 == count) break; } return ret; }
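/*
 * Illustrative userspace sketch (libbpf): how the BPF_F_BEFORE/relative-fd
 * semantics resolved by bpf_mprog_attach() above look from the attach side,
 * using tcx ingress as the attach point. prog_fd, anchor_fd and ifindex are
 * assumed to exist already.
 */
#include <bpf/bpf.h>

static int attach_before(int prog_fd, int anchor_fd, int ifindex)
{
	LIBBPF_OPTS(bpf_prog_attach_opts, opts,
		.flags		= BPF_F_BEFORE,	/* position via bpf_mprog_pos_before() */
		.relative_fd	= anchor_fd,	/* anchor program fd */
	);

	return bpf_prog_attach_opts(prog_fd, ifindex, BPF_TCX_INGRESS, &opts);
}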
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include "blk.h" #include "blk-mq.h" static void blk_mq_sysfs_release(struct kobject *kobj) { struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); free_percpu(ctxs->queue_ctx); kfree(ctxs); } static void blk_mq_ctx_sysfs_release(struct kobject *kobj) { struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); /* ctx->ctxs won't be released until all ctx are freed */ kobject_put(&ctx->ctxs->kobj); } static void blk_mq_hw_sysfs_release(struct kobject *kobj) { struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); kfree(hctx); } struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); }; static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_mq_hw_ctx_sysfs_entry *entry; struct blk_mq_hw_ctx *hctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; if (!entry->show) return -EIO; mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); mutex_unlock(&q->elevator_lock); return res; } static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_tags); } static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) ret = snprintf(pos + page, size - pos, "%u", i); else ret = snprintf(pos + page, size - pos, ", %u", i); if (ret >= size - pos) break; first = 0; pos += ret; } ret = snprintf(pos + page, size + 1 - pos, "\n"); return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = 0444 }, .show =
blk_mq_hw_sysfs_nr_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, NULL, }; ATTRIBUTE_GROUPS(default_hw_ctx); static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, }; static const struct kobj_type blk_mq_ktype = { .release = blk_mq_sysfs_release, }; static const struct kobj_type blk_mq_ctx_ktype = { .release = blk_mq_ctx_sysfs_release, }; static const struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_groups = default_hw_ctx_groups, .release = blk_mq_hw_sysfs_release, }; static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; int i; if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) if (ctx->kobj.state_in_sysfs) kobject_del(&ctx->kobj); if (hctx->kobj.state_in_sysfs) kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; int i, j, ret; if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); if (ret) return ret; hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) goto out; } return 0; out: hctx_for_each_ctx(hctx, ctx, j) { if (j < i) kobject_del(&ctx->kobj); } kobject_del(&hctx->kobj); return ret; } void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); } void blk_mq_sysfs_deinit(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_put(&ctx->kobj); } kobject_put(q->mq_kobj); } void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; kobject_init(q->mq_kobj, &blk_mq_ktype); for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_get(q->mq_kobj); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } } int blk_mq_sysfs_register(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; unsigned long i, j; int ret; ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) return ret; kobject_uevent(q->mq_kobj, KOBJ_ADD); mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) goto out_unreg; } mutex_unlock(&q->tag_set->tag_list_lock); return 0; out_unreg: queue_for_each_hw_ctx(q, hctx, j) { if (j < i) blk_mq_unregister_hctx(hctx); } mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); return ret; } void blk_mq_sysfs_unregister(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); } void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; if (!blk_queue_registered(q)) return; queue_for_each_hw_ctx(q, hctx, i) 
blk_mq_unregister_hctx(hctx); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; int ret = 0; if (!blk_queue_registered(q)) goto out; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) break; } out: return ret; }
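/*
 * Editor's note (illustrative sketch, not part of the kernel sources above):
 * the three attributes registered by this file surface per hardware queue
 * under /sys/block/<disk>/mq/<n>/. A minimal user-space reader; the disk
 * name "nvme0n1" and queue number 0 are assumptions for illustration.
 */
#include <stdio.h>

int main(void)
{
	static const char *attrs[] = { "nr_tags", "nr_reserved_tags", "cpu_list" };
	char path[128], line[256];

	for (int i = 0; i < 3; i++) {
		snprintf(path, sizeof(path), "/sys/block/nvme0n1/mq/0/%s", attrs[i]);
		FILE *f = fopen(path, "r");

		if (!f)
			continue; /* disk or queue absent on this system */
		if (fgets(line, sizeof(line), f))
			printf("%s: %s", attrs[i], line);
		fclose(f);
	}
	return 0;
}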
2 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API. * * ARIA Cipher Algorithm. * * Documentation of ARIA can be found in RFC 5794. * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> * * Information for ARIA * http://210.104.33.10/ARIA/index-e.html (English) * http://seed.kisa.or.kr/ (Korean) * * Public domain version is distributed above. 
*/ #ifndef _CRYPTO_ARIA_H #define _CRYPTO_ARIA_H #include <crypto/algapi.h> #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/errno.h> #include <asm/byteorder.h> #define ARIA_MIN_KEY_SIZE 16 #define ARIA_MAX_KEY_SIZE 32 #define ARIA_BLOCK_SIZE 16 #define ARIA_MAX_RD_KEYS 17 #define ARIA_RD_KEY_WORDS (ARIA_BLOCK_SIZE / sizeof(u32)) struct aria_ctx { u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; int rounds; int key_length; }; static const u32 s1[256] = { 0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b, 0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5, 0x00303030, 0x00010101, 0x00676767, 0x002b2b2b, 0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676, 0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d, 0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0, 0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf, 0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0, 0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626, 0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc, 0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1, 0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515, 0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3, 0x00181818, 0x00969696, 0x00050505, 0x009a9a9a, 0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2, 0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575, 0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a, 0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0, 0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3, 0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484, 0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed, 0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b, 0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939, 0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf, 0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb, 0x00434343, 0x004d4d4d, 0x00333333, 0x00858585, 0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f, 0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8, 0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f, 0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5, 0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121, 0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2, 0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec, 0x005f5f5f, 0x00979797, 0x00444444, 0x00171717, 0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d, 0x00646464, 0x005d5d5d, 0x00191919, 0x00737373, 0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc, 0x00222222, 0x002a2a2a, 0x00909090, 0x00888888, 0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414, 0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb, 0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a, 0x00494949, 0x00060606, 0x00242424, 0x005c5c5c, 0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262, 0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979, 0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d, 0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9, 0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea, 0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808, 0x00bababa, 0x00787878, 0x00252525, 0x002e2e2e, 0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6, 0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f, 0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a, 0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666, 0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e, 0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9, 0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e, 0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111, 0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494, 0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9, 0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf, 0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d, 0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868, 0x00414141, 0x00999999, 
0x002d2d2d, 0x000f0f0f, 0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616 }; static const u32 s2[256] = { 0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc, 0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc, 0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646, 0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1, 0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb, 0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b, 0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303, 0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1, 0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b, 0x55005555, 0xf000f0f0, 0x99009999, 0x69006969, 0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae, 0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb, 0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb, 0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4, 0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa, 0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb, 0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929, 0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191, 0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595, 0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd, 0x08000808, 0x7a007a7a, 0x88008888, 0x38003838, 0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828, 0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7, 0x93009393, 0xa400a4a4, 0x12001212, 0x53005353, 0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131, 0x36003636, 0x21002121, 0x58005858, 0x48004848, 0x01000101, 0x8e008e8e, 0x37003737, 0x74007474, 0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1, 0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7, 0xc400c4c4, 0x56005656, 0x42004242, 0x26002626, 0x07000707, 0x98009898, 0x60006060, 0xd900d9d9, 0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040, 0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd, 0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404, 0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f, 0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc, 0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757, 0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565, 0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e, 0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5, 0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717, 0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a, 0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767, 0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343, 0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5, 0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434, 0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a, 0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8, 0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b, 0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6, 0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424, 0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada, 0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef, 0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f, 0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a, 0x72007272, 0x44004444, 0x9b009b9b, 0x6c006c6c, 0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333, 0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3, 0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0, 0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d, 0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5, 0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8, 0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a, 0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181 }; static const u32 x1[256] = { 0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5, 0x30300030, 0x36360036, 0xa5a500a5, 0x38380038, 0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e, 0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb, 0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082, 0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087, 0x34340034, 0x8e8e008e, 0x43430043, 0x44440044, 0xc4c400c4, 
0xdede00de, 0xe9e900e9, 0xcbcb00cb, 0x54540054, 0x7b7b007b, 0x94940094, 0x32320032, 0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d, 0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b, 0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e, 0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066, 0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2, 0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049, 0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025, 0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064, 0x86860086, 0x68680068, 0x98980098, 0x16160016, 0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc, 0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092, 0x6c6c006c, 0x70700070, 0x48480048, 0x50500050, 0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da, 0x5e5e005e, 0x15150015, 0x46460046, 0x57570057, 0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084, 0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000, 0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a, 0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005, 0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006, 0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f, 0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002, 0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003, 0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b, 0x3a3a003a, 0x91910091, 0x11110011, 0x41410041, 0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea, 0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce, 0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073, 0x96960096, 0xacac00ac, 0x74740074, 0x22220022, 0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085, 0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8, 0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e, 0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071, 0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089, 0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e, 0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b, 0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b, 0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020, 0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe, 0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4, 0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033, 0x88880088, 0x07070007, 0xc7c700c7, 0x31310031, 0xb1b100b1, 0x12120012, 0x10100010, 0x59590059, 0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f, 0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9, 0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d, 0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f, 0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef, 0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d, 0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0, 0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c, 0x83830083, 0x53530053, 0x99990099, 0x61610061, 0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e, 0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026, 0xe1e100e1, 0x69690069, 0x14140014, 0x63630063, 0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d }; static const u32 x2[256] = { 0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00, 0x87878700, 0xb9b9b900, 0x21212100, 0x78787800, 0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100, 0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00, 0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00, 0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300, 0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600, 0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00, 0x81818100, 0x65656500, 0xf5f5f500, 0x89898900, 0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600, 0x57575700, 0x43434300, 0x56565600, 0x17171700, 0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00, 0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300, 0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00, 0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800, 0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00, 0x7f7f7f00, 0x22222200, 
0x76767600, 0xafafaf00, 0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800, 0x67676700, 0x88888800, 0x06060600, 0xc3c3c300, 0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00, 0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00, 0x02020200, 0x24242400, 0x75757500, 0x93939300, 0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200, 0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00, 0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00, 0x12121200, 0x97979700, 0x32323200, 0xababab00, 0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300, 0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900, 0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100, 0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900, 0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600, 0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100, 0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500, 0x86868600, 0x36363600, 0xbebebe00, 0x61616100, 0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00, 0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00, 0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00, 0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500, 0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00, 0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700, 0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00, 0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000, 0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100, 0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00, 0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600, 0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000, 0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00, 0x18181800, 0x83838300, 0x16161600, 0xa5a5a500, 0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500, 0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00, 0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300, 0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500, 0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00, 0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300, 0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900, 0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00, 0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400, 0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00, 0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00, 0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300, 0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700, 0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00, 0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300, 0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000 }; static inline u32 rotl32(u32 v, u32 r) { return ((v << r) | (v >> (32 - r))); } static inline u32 rotr32(u32 v, u32 r) { return ((v >> r) | (v << (32 - r))); } static inline u32 bswap32(u32 v) { return ((v << 24) ^ (v >> 24) ^ ((v & 0x0000ff00) << 8) ^ ((v & 0x00ff0000) >> 8)); } static inline u8 get_u8(u32 x, u32 y) { return (x >> ((3 - y) * 8)); } static inline u32 make_u32(u8 v0, u8 v1, u8 v2, u8 v3) { return ((u32)v0 << 24) | ((u32)v1 << 16) | ((u32)v2 << 8) | ((u32)v3); } static inline u32 aria_m(u32 t0) { return rotr32(t0, 8) ^ rotr32(t0 ^ rotr32(t0, 8), 16); } /* S-Box Layer 1 + M */ static inline void aria_sbox_layer1_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3) { *t0 = s1[get_u8(*t0, 0)] ^ s2[get_u8(*t0, 1)] ^ x1[get_u8(*t0, 2)] ^ x2[get_u8(*t0, 3)]; *t1 = s1[get_u8(*t1, 0)] ^ s2[get_u8(*t1, 1)] ^ x1[get_u8(*t1, 2)] ^ x2[get_u8(*t1, 3)]; *t2 = s1[get_u8(*t2, 0)] ^ s2[get_u8(*t2, 1)] ^ x1[get_u8(*t2, 2)] ^ x2[get_u8(*t2, 3)]; *t3 = s1[get_u8(*t3, 0)] ^ s2[get_u8(*t3, 1)] ^ x1[get_u8(*t3, 2)] ^ x2[get_u8(*t3, 3)]; } /* S-Box Layer 2 + M */ static inline void aria_sbox_layer2_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3) { *t0 = x1[get_u8(*t0, 0)] ^ x2[get_u8(*t0, 1)] ^ s1[get_u8(*t0, 2)] ^ s2[get_u8(*t0, 3)]; *t1 = x1[get_u8(*t1, 0)] ^ 
x2[get_u8(*t1, 1)] ^ s1[get_u8(*t1, 2)] ^ s2[get_u8(*t1, 3)]; *t2 = x1[get_u8(*t2, 0)] ^ x2[get_u8(*t2, 1)] ^ s1[get_u8(*t2, 2)] ^ s2[get_u8(*t2, 3)]; *t3 = x1[get_u8(*t3, 0)] ^ x2[get_u8(*t3, 1)] ^ s1[get_u8(*t3, 2)] ^ s2[get_u8(*t3, 3)]; } /* Word-level diffusion */ static inline void aria_diff_word(u32 *t0, u32 *t1, u32 *t2, u32 *t3) { *t1 ^= *t2; *t2 ^= *t3; *t0 ^= *t1; *t3 ^= *t1; *t2 ^= *t0; *t1 ^= *t2; } /* Byte-level diffusion */ static inline void aria_diff_byte(u32 *t1, u32 *t2, u32 *t3) { *t1 = ((*t1 << 8) & 0xff00ff00) ^ ((*t1 >> 8) & 0x00ff00ff); *t2 = rotr32(*t2, 16); *t3 = bswap32(*t3); } /* Key XOR Layer */ static inline void aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3) { *t0 ^= rk[0]; *t1 ^= rk[1]; *t2 ^= rk[2]; *t3 ^= rk[3]; } /* Odd round Substitution & Diffusion */ static inline void aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3) { aria_sbox_layer1_with_pre_diff(t0, t1, t2, t3); aria_diff_word(t0, t1, t2, t3); aria_diff_byte(t1, t2, t3); aria_diff_word(t0, t1, t2, t3); } /* Even round Substitution & Diffusion */ static inline void aria_subst_diff_even(u32 *t0, u32 *t1, u32 *t2, u32 *t3) { aria_sbox_layer2_with_pre_diff(t0, t1, t2, t3); aria_diff_word(t0, t1, t2, t3); aria_diff_byte(t3, t0, t1); aria_diff_word(t0, t1, t2, t3); } /* Q, R Macro expanded ARIA GSRK */ static inline void aria_gsrk(u32 *rk, u32 *x, u32 *y, u32 n) { int q = 4 - (n / 32); int r = n % 32; rk[0] = (x[0]) ^ ((y[q % 4]) >> r) ^ ((y[(q + 3) % 4]) << (32 - r)); rk[1] = (x[1]) ^ ((y[(q + 1) % 4]) >> r) ^ ((y[q % 4]) << (32 - r)); rk[2] = (x[2]) ^ ((y[(q + 2) % 4]) >> r) ^ ((y[(q + 1) % 4]) << (32 - r)); rk[3] = (x[3]) ^ ((y[(q + 3) % 4]) >> r) ^ ((y[(q + 2) % 4]) << (32 - r)); } void aria_encrypt(void *ctx, u8 *out, const u8 *in); void aria_decrypt(void *ctx, u8 *out, const u8 *in); int aria_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len); #endif
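/*
 * Editor's note (illustrative sketch, not part of the kernel sources above):
 * a standalone round-trip check of the byte-access helpers in this header,
 * restated with <stdint.h> types so it builds in user space. The logic of
 * rotr32()/get_u8()/make_u32() is copied from the header; only the test
 * harness is new.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t rotr32(uint32_t v, uint32_t r) { return (v >> r) | (v << (32 - r)); }
static uint8_t get_u8(uint32_t x, uint32_t y) { return x >> ((3 - y) * 8); }
static uint32_t make_u32(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3)
{
	return ((uint32_t)v0 << 24) | ((uint32_t)v1 << 16) |
	       ((uint32_t)v2 << 8) | (uint32_t)v3;
}

int main(void)
{
	uint32_t w = 0x11223344;

	/* get_u8(w, 0) is the most significant byte. */
	assert(get_u8(w, 0) == 0x11 && get_u8(w, 3) == 0x44);
	/* make_u32() reassembles the same word from its four bytes. */
	assert(make_u32(get_u8(w, 0), get_u8(w, 1), get_u8(w, 2), get_u8(w, 3)) == w);
	/* aria_m()'s building block: rotr32 by 8 moves the low byte to the top. */
	assert(rotr32(w, 8) == 0x44112233);
	printf("aria helper round-trip ok\n");
	return 0;
}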
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_KEXEC_H #define LINUX_KEXEC_H #define IND_DESTINATION_BIT 0 #define IND_INDIRECTION_BIT 1 #define IND_DONE_BIT 2 #define IND_SOURCE_BIT 3 #define IND_DESTINATION (1 << IND_DESTINATION_BIT) #define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) #define IND_DONE (1 << IND_DONE_BIT) #define IND_SOURCE (1 << IND_SOURCE_BIT) #define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) #if !defined(__ASSEMBLY__) #include <linux/vmcore_info.h> #include <linux/crash_reserve.h> #include <asm/io.h> #include <linux/range.h> #include <uapi/linux/kexec.h> #include <linux/verification.h> extern note_buf_t __percpu *crash_notes; #ifdef CONFIG_CRASH_DUMP #include <linux/prandom.h> #endif #ifdef CONFIG_KEXEC_CORE #include <linux/list.h> #include <linux/compat.h> #include <linux/ioport.h> #include <linux/module.h> #include <linux/highmem.h> #include <asm/kexec.h> #include <linux/crash_core.h> /* Verify architecture specific macros are defined */ #ifndef KEXEC_SOURCE_MEMORY_LIMIT #error KEXEC_SOURCE_MEMORY_LIMIT not defined #endif #ifndef KEXEC_DESTINATION_MEMORY_LIMIT #error KEXEC_DESTINATION_MEMORY_LIMIT not defined #endif #ifndef KEXEC_CONTROL_MEMORY_LIMIT #error KEXEC_CONTROL_MEMORY_LIMIT not defined #endif #ifndef KEXEC_CONTROL_MEMORY_GFP #define KEXEC_CONTROL_MEMORY_GFP (GFP_KERNEL | __GFP_NORETRY) #endif #ifndef KEXEC_CONTROL_PAGE_SIZE #error
KEXEC_CONTROL_PAGE_SIZE not defined #endif #ifndef KEXEC_ARCH #error KEXEC_ARCH not defined #endif #ifndef KEXEC_CRASH_CONTROL_MEMORY_LIMIT #define KEXEC_CRASH_CONTROL_MEMORY_LIMIT KEXEC_CONTROL_MEMORY_LIMIT #endif #ifndef KEXEC_CRASH_MEM_ALIGN #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif /* * This structure is used to hold the arguments that are used when loading * kernel binaries. */ typedef unsigned long kimage_entry_t; /* * This is a copy of the UAPI struct kexec_segment and must be identical * to it because it gets copied straight from user space into kernel * memory. Do not modify this structure unless you change the way segments * get ingested from user space. */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system * call is used or will point to kernel memory if * kexec_file_load() system call is used. * * Use ->buf when expecting to deal with user memory and use ->kbuf * when expecting to deal with kernel memory. */ union { void __user *buf; void *kbuf; }; size_t bufsz; unsigned long mem; size_t memsz; }; #ifdef CONFIG_COMPAT struct compat_kexec_segment { compat_uptr_t buf; compat_size_t bufsz; compat_ulong_t mem; /* User space sees this as a (void *) ... */ compat_size_t memsz; }; #endif #ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* * Pointer to elf header at the beginning of kexec_purgatory. * Note: kexec_purgatory is read only */ const Elf_Ehdr *ehdr; /* * Temporary, modifiable buffer for sechdrs used for relocation. * This memory can be freed post image load. */ Elf_Shdr *sechdrs; /* * Temporary, modifiable buffer for stripped purgatory used for * relocation. This memory can be freed post image load. */ void *purgatory_buf; }; struct kimage; typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); #ifdef CONFIG_KEXEC_SIG typedef int (kexec_verify_sig_t)(const char *kernel_buf, unsigned long kernel_len); #endif struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; #ifdef CONFIG_KEXEC_SIG kexec_verify_sig_t *verify_sig; #endif }; extern const struct kexec_file_ops * const kexec_file_loaders[]; int kexec_image_probe_default(struct kimage *image, void *buf, unsigned long buf_len); int kexec_image_post_load_cleanup_default(struct kimage *image); /* * If kexec_buf.mem is set to this value, kexec_locate_mem_hole() * will try to allocate free memory. Arch may overwrite it. */ #ifndef KEXEC_BUF_MEM_UNKNOWN #define KEXEC_BUF_MEM_UNKNOWN 0 #endif /** * struct kexec_buf - parameters for finding a place for a buffer in memory * @image: kexec image in which memory to search. * @buffer: Contents which will be copied to the allocated memory. * @bufsz: Size of @buffer. * @mem: On return will have address of the buffer in memory. * @memsz: Size for the buffer in memory. * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory. * @random: Place the buffer at a random position. 
*/ struct kexec_buf { struct kimage *image; void *buffer; unsigned long bufsz; unsigned long mem; unsigned long memsz; unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; #endif }; #ifdef CONFIG_CRASH_DUMP static inline void kexec_random_range_start(unsigned long start, unsigned long end, struct kexec_buf *kbuf, unsigned long *temp_start) { unsigned short i; if (kbuf->random) { get_random_bytes(&i, sizeof(unsigned short)); *temp_start = start + (end - start) / USHRT_MAX * i; } } #else static inline void kexec_random_range_start(unsigned long start, unsigned long end, struct kexec_buf *kbuf, unsigned long *temp_start) {} #endif int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); #ifndef arch_kexec_kernel_image_probe static inline int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { return kexec_image_probe_default(image, buf, buf_len); } #endif #ifndef arch_kimage_file_post_load_cleanup static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) { return kexec_image_post_load_cleanup_default(image); } #endif #ifndef arch_check_excluded_range static inline int arch_check_excluded_range(struct kimage *image, unsigned long start, unsigned long end) { return 0; } #endif #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); #endif #endif extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); #ifndef arch_kexec_locate_mem_hole /** * arch_kexec_locate_mem_hole - Find free memory to place the segments. * @kbuf: Parameters for the memory search. * * On success, kbuf->mem will have the start address of the memory region found. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) { return kexec_locate_mem_hole(kbuf); } #endif #ifndef arch_kexec_apply_relocations_add /* * arch_kexec_apply_relocations_add - apply relocations of type RELA * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELAs. * @symtab: Corresponding symtab. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } #endif #ifndef arch_kexec_apply_relocations /* * arch_kexec_apply_relocations - apply relocations of type REL * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELs. * @symtab: Corresponding symtab. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; } #endif #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF struct kexec_elf_info { /* * Where the ELF binary contents are kept. * Memory managed by the user of the struct. 
*/ const char *buffer; const struct elfhdr *ehdr; const struct elf_phdr *proghdrs; }; int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, struct kexec_elf_info *elf_info); int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_elf_info *elf_info, struct kexec_buf *kbuf, unsigned long *lowest_load_addr); void kexec_free_elf_info(struct kexec_elf_info *elf_info); int kexec_elf_probe(const char *buf, unsigned long len); #endif struct kimage { kimage_entry_t head; kimage_entry_t *entry; kimage_entry_t *last_entry; unsigned long start; struct page *control_code_page; struct page *swap_page; void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; /* Flags to indicate special processing */ unsigned int type : 1; #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; #ifdef CONFIG_CRASH_HOTPLUG /* If set, it is safe to update kexec segments that are * excluded from SHA calculation. */ unsigned int hotplug_support:1; #endif unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif #ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; void *initrd_buf; unsigned long initrd_buf_len; char *cmdline_buf; unsigned long cmdline_buf_len; /* File operations provided by image loader */ const struct kexec_file_ops *fops; /* Image loader handling the kernel can store a pointer here */ void *image_loader_data; /* Information for loading purgatory */ struct purgatory_info purgatory_info; /* Force carrying over the DTB from the current boot */ bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG int hp_action; int elfcorehdr_index; bool elfcorehdr_updated; #endif #ifdef CONFIG_IMA_KEXEC /* Virtual address of IMA measurement buffer for kexec syscall */ void *ima_buffer; phys_addr_t ima_buffer_addr; size_t ima_buffer_size; unsigned long ima_segment_index; bool is_ima_segment_index_set; #endif struct { struct kexec_segment *scratch; phys_addr_t fdt; } kho; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_sz; unsigned long elf_load_addr; /* dm crypt keys buffer */ unsigned long dm_crypt_keys_addr; unsigned long dm_crypt_keys_sz; }; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); #ifndef machine_kexec_post_load static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; bool kexec_load_permitted(int kexec_image_type); #ifndef kexec_flush_icache_page #define kexec_flush_icache_page(page) #endif /* List of defined/legal kexec flags */ #ifndef CONFIG_KEXEC_JUMP #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT) #else #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \ KEXEC_CRASH_HOTPLUG_SUPPORT) 
#endif /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { return page_to_pfn(page); } #endif #ifndef boot_pfn_to_page static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) { return pfn_to_page(boot_pfn); } #endif #ifndef phys_to_boot_phys static inline unsigned long phys_to_boot_phys(phys_addr_t phys) { return phys; } #endif #ifndef boot_phys_to_phys static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys) { return boot_phys; } #endif #ifndef crash_free_reserved_phys_range static inline void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } #endif static inline unsigned long virt_to_boot_phys(void *addr) { return phys_to_boot_phys(__pa((unsigned long)addr)); } static inline void *boot_phys_to_virt(unsigned long entry) { return phys_to_virt(boot_phys_to_phys(entry)); } #ifndef arch_kexec_post_alloc_pages static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } #endif #ifndef arch_kexec_pre_free_pages static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, arg...) \ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); extern void kimage_unmap_segment(void *buffer); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; struct kimage; static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } static inline int kexec_crash_loaded(void) { return 0; } static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) { return NULL; } static inline void kimage_unmap_segment(void *buffer) { } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_KEXEC_SIG void set_kexec_sig_enforced(void); #else static inline void set_kexec_sig_enforced(void) {} #endif #endif /* !defined(__ASSEMBLY__) */ #endif /* LINUX_KEXEC_H */
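/*
 * Editor's note (illustrative sketch, not part of the kernel sources above):
 * a kimage_entry_t packs a page-aligned address with the IND_* type bits
 * defined at the top of this header in its low bits. A toy decoder using
 * those same macro values; the addresses in main() are made up.
 */
#include <stdio.h>

#define IND_DESTINATION (1 << 0)
#define IND_INDIRECTION (1 << 1)
#define IND_DONE        (1 << 2)
#define IND_SOURCE      (1 << 3)
#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)

static void decode(unsigned long entry)
{
	unsigned long addr = entry & ~(unsigned long)IND_FLAGS;

	if (entry & IND_DESTINATION)
		printf("destination page at 0x%lx\n", addr);
	else if (entry & IND_SOURCE)
		printf("source page at 0x%lx\n", addr);
	else if (entry & IND_INDIRECTION)
		printf("indirection page at 0x%lx\n", addr);
	else if (entry & IND_DONE)
		printf("done\n");
}

int main(void)
{
	decode(0x100000UL | IND_DESTINATION); /* hypothetical addresses */
	decode(0x200000UL | IND_SOURCE);
	decode(IND_DONE);
	return 0;
}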
// SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run.
*/ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. */ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. 
*/ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (!test_bit_acquire(KEY_FLAG_USER_ALIVE, &key->flags)) { /* Clobber key->user after final put seen. */ goto found_unreferenced_key; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. */ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. 
We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we * can reap the keys to which it refers. */ found_keyring: spin_unlock(&key_serial_lock); keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and * destroy its payload with its semaphore held. */ destroy_dead_key: spin_unlock(&key_serial_lock); kdebug("destroy key %d", key->serial); down_write(&key->sem); key->type = &key_type_dead; if (key_gc_dead_keytype->destroy) key_gc_dead_keytype->destroy(key); memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); up_write(&key->sem); goto maybe_resched; }
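/*
 * Editor's note (illustrative sketch, not part of the kernel sources above):
 * the keytype-reap state machine in key_garbage_collector() advances one
 * stage per GC pass by masking and left-shifting gc_state, so
 * KEY_GC_REAPING_DEAD_1 (0x10, mark dead keys) becomes _DEAD_2 (0x20, reap
 * dead links) and then _DEAD_3 (0x40, destroy dead keys). A standalone
 * trace of that progression using the same constants.
 */
#include <stdio.h>

#define KEY_GC_REAPING_DEAD_1 0x10
#define KEY_GC_REAPING_DEAD_2 0x20
#define KEY_GC_REAPING_DEAD_3 0x40

int main(void)
{
	unsigned char gc_state = 0;
	int reap_keytype = 1; /* pretend KEY_GC_REAP_KEYTYPE was set once */

	for (int pass = 1; pass <= 3; pass++) {
		/* Same dance as key_garbage_collector(): keep only the dead
		 * stages, advance them, then fold in new requests. */
		gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2;
		gc_state <<= 1;
		if (reap_keytype) {
			gc_state |= KEY_GC_REAPING_DEAD_1;
			reap_keytype = 0;
		}
		printf("pass %d: gc_state=0x%02x\n", pass, gc_state);
	}
	/* Prints 0x10, then 0x20, then 0x40: mark, reap links, destroy. */
	return 0;
}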
// SPDX-License-Identifier: GPL-2.0-or-later /* Diffie-Hellman Key Agreement Method [RFC2631] * * Copyright (c) 2016, Intel Corporation * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com> */ #include <linux/fips.h> #include <linux/module.h> #include <crypto/internal/kpp.h> #include <crypto/kpp.h> #include <crypto/dh.h> #include <crypto/rng.h> #include <linux/mpi.h> struct dh_ctx { MPI p; /* Value is guaranteed to be set. */ MPI g; /* Value is guaranteed to be set. */ MPI xa; /* Value is guaranteed to be set. */ }; static void dh_clear_ctx(struct dh_ctx *ctx) { mpi_free(ctx->p); mpi_free(ctx->g); mpi_free(ctx->xa); memset(ctx, 0, sizeof(*ctx)); } /* * If base is g we compute the public key * ya = g^xa mod p; [RFC2631 sec 2.1.1] * else if base is the counterpart public key we compute the shared secret * ZZ = yb^xa mod p; [RFC2631 sec 2.1.1] */ static int _compute_val(const struct dh_ctx *ctx, MPI base, MPI val) { /* val = base^xa mod p */ return mpi_powm(val, base, ctx->xa, ctx->p); } static inline struct dh_ctx *dh_get_ctx(struct crypto_kpp *tfm) { return kpp_tfm_ctx(tfm); } static int dh_check_params_length(unsigned int p_len) { if (fips_enabled) return (p_len < 2048) ? -EINVAL : 0; return (p_len < 1536) ? -EINVAL : 0; } static int dh_set_params(struct dh_ctx *ctx, struct dh *params) { if (dh_check_params_length(params->p_size << 3)) return -EINVAL; ctx->p = mpi_read_raw_data(params->p, params->p_size); if (!ctx->p) return -EINVAL; ctx->g = mpi_read_raw_data(params->g, params->g_size); if (!ctx->g) return -EINVAL; return 0; } static int dh_set_secret(struct crypto_kpp *tfm, const void *buf, unsigned int len) { struct dh_ctx *ctx = dh_get_ctx(tfm); struct dh params; /* Free the old MPI key if any */ dh_clear_ctx(ctx); if (crypto_dh_decode_key(buf, len, &params) < 0) goto err_clear_ctx; if (dh_set_params(ctx, &params) < 0) goto err_clear_ctx; ctx->xa = mpi_read_raw_data(params.key, params.key_size); if (!ctx->xa) goto err_clear_ctx; return 0; err_clear_ctx: dh_clear_ctx(ctx); return -EINVAL; } /* * SP800-56A public key verification: * * * For the safe-prime groups in FIPS mode, Q can be computed * trivially from P and a full validation according to SP800-56A * section 5.6.2.3.1 is performed. * * * For all other sets of group parameters, only a partial validation * according to SP800-56A section 5.6.2.3.2 is performed. */ static int dh_is_pubkey_valid(struct dh_ctx *ctx, MPI y) { MPI val, q; int ret; if (!fips_enabled) return 0; if (unlikely(!ctx->p)) return -EINVAL; /* * Step 1: Verify that 2 <= y <= p - 2. * * The upper limit check is actually y < p instead of y < p - 1 * in order to save one mpi_sub_ui() invocation here. Note that * p - 1 is the non-trivial element of the subgroup of order 2 and * thus, the check on y^q below would fail if y == p - 1. */ if (mpi_cmp_ui(y, 1) < 1 || mpi_cmp(y, ctx->p) >= 0) return -EINVAL; /* * Step 2: Verify that 1 = y^q mod p * * For the safe-prime groups q = (p - 1)/2. */ val = mpi_alloc(0); if (!val) return -ENOMEM; q = mpi_alloc(mpi_get_nlimbs(ctx->p)); if (!q) { mpi_free(val); return -ENOMEM; } /* * ->p is odd, so no need to explicitly subtract one * from it before shifting to the right.
*/ ret = mpi_rshift(q, ctx->p, 1) ?: mpi_powm(val, y, q, ctx->p); mpi_free(q); if (ret) { mpi_free(val); return ret; } ret = mpi_cmp_ui(val, 1); mpi_free(val); if (ret != 0) return -EINVAL; return 0; } static int dh_compute_value(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); struct dh_ctx *ctx = dh_get_ctx(tfm); MPI base, val = mpi_alloc(0); int ret = 0; int sign; if (!val) return -ENOMEM; if (unlikely(!ctx->xa)) { ret = -EINVAL; goto err_free_val; } if (req->src) { base = mpi_read_raw_from_sgl(req->src, req->src_len); if (!base) { ret = -EINVAL; goto err_free_val; } ret = dh_is_pubkey_valid(ctx, base); if (ret) goto err_free_base; } else { base = ctx->g; } ret = _compute_val(ctx, base, val); if (ret) goto err_free_base; if (fips_enabled) { /* SP800-56A rev3 5.7.1.1 check: Validation of shared secret */ if (req->src) { MPI pone; /* z <= 1 */ if (mpi_cmp_ui(val, 1) < 1) { ret = -EBADMSG; goto err_free_base; } /* z == p - 1 */ pone = mpi_alloc(0); if (!pone) { ret = -ENOMEM; goto err_free_base; } ret = mpi_sub_ui(pone, ctx->p, 1); if (!ret && !mpi_cmp(pone, val)) ret = -EBADMSG; mpi_free(pone); if (ret) goto err_free_base; /* SP800-56A rev 3 5.6.2.1.3 key check */ } else { if (dh_is_pubkey_valid(ctx, val)) { ret = -EAGAIN; goto err_free_val; } } } ret = mpi_write_to_sgl(val, req->dst, req->dst_len, &sign); if (ret) goto err_free_base; if (sign < 0) ret = -EBADMSG; err_free_base: if (req->src) mpi_free(base); err_free_val: mpi_free(val); return ret; } static unsigned int dh_max_size(struct crypto_kpp *tfm) { struct dh_ctx *ctx = dh_get_ctx(tfm); return mpi_get_size(ctx->p); } static void dh_exit_tfm(struct crypto_kpp *tfm) { struct dh_ctx *ctx = dh_get_ctx(tfm); dh_clear_ctx(ctx); } static struct kpp_alg dh = { .set_secret = dh_set_secret, .generate_public_key = dh_compute_value, .compute_shared_secret = dh_compute_value, .max_size = dh_max_size, .exit = dh_exit_tfm, .base = { .cra_name = "dh", .cra_driver_name = "dh-generic", .cra_priority = 100, .cra_module = THIS_MODULE, .cra_ctxsize = sizeof(struct dh_ctx), }, }; struct dh_safe_prime { unsigned int max_strength; unsigned int p_size; const char *p; }; static const char safe_prime_g[] = { 2 }; struct dh_safe_prime_instance_ctx { struct crypto_kpp_spawn dh_spawn; const struct dh_safe_prime *safe_prime; }; struct dh_safe_prime_tfm_ctx { struct crypto_kpp *dh_tfm; }; static void dh_safe_prime_free_instance(struct kpp_instance *inst) { struct dh_safe_prime_instance_ctx *ctx = kpp_instance_ctx(inst); crypto_drop_kpp(&ctx->dh_spawn); kfree(inst); } static inline struct dh_safe_prime_instance_ctx *dh_safe_prime_instance_ctx( struct crypto_kpp *tfm) { return kpp_instance_ctx(kpp_alg_instance(tfm)); } static int dh_safe_prime_init_tfm(struct crypto_kpp *tfm) { struct dh_safe_prime_instance_ctx *inst_ctx = dh_safe_prime_instance_ctx(tfm); struct dh_safe_prime_tfm_ctx *tfm_ctx = kpp_tfm_ctx(tfm); tfm_ctx->dh_tfm = crypto_spawn_kpp(&inst_ctx->dh_spawn); if (IS_ERR(tfm_ctx->dh_tfm)) return PTR_ERR(tfm_ctx->dh_tfm); kpp_set_reqsize(tfm, sizeof(struct kpp_request) + crypto_kpp_reqsize(tfm_ctx->dh_tfm)); return 0; } static void dh_safe_prime_exit_tfm(struct crypto_kpp *tfm) { struct dh_safe_prime_tfm_ctx *tfm_ctx = kpp_tfm_ctx(tfm); crypto_free_kpp(tfm_ctx->dh_tfm); } static u64 __add_u64_to_be(__be64 *dst, unsigned int n, u64 val) { unsigned int i; for (i = n; val && i > 0; --i) { u64 tmp = be64_to_cpu(dst[i - 1]); tmp += val; val = tmp >= val ? 
0 : 1; dst[i - 1] = cpu_to_be64(tmp); } return val; } static void *dh_safe_prime_gen_privkey(const struct dh_safe_prime *safe_prime, unsigned int *key_size) { unsigned int n, oversampling_size; __be64 *key; int err; u64 h, o; /* * Generate a private key following NIST SP800-56Ar3, * sec. 5.6.1.1.1 and 5.6.1.1.3 resp.. * * 5.6.1.1.1: choose key length N such that * 2 * ->max_strength <= N <= log2(q) + 1 = ->p_size * 8 - 1 * with q = (p - 1) / 2 for the safe-prime groups. * Choose the lower bound's next power of two for N in order to * avoid excessively large private keys while still * maintaining some extra reserve beyond the bare minimum in * most cases. Note that for each entry in safe_prime_groups[], * the following holds for such N: * - N >= 256, in particular it is a multiple of 2^6 = 64 * bits and * - N < log2(q) + 1, i.e. N respects the upper bound. */ n = roundup_pow_of_two(2 * safe_prime->max_strength); WARN_ON_ONCE(n & ((1u << 6) - 1)); n >>= 6; /* Convert N into units of u64. */ /* * Reserve one extra u64 to hold the extra random bits * required as per 5.6.1.1.3. */ oversampling_size = (n + 1) * sizeof(__be64); key = kmalloc(oversampling_size, GFP_KERNEL); if (!key) return ERR_PTR(-ENOMEM); /* * 5.6.1.1.3, step 3 (and implicitly step 4): obtain N + 64 * random bits and interpret them as a big endian integer. */ err = -EFAULT; if (crypto_get_default_rng()) goto out_err; err = crypto_rng_get_bytes(crypto_default_rng, (u8 *)key, oversampling_size); crypto_put_default_rng(); if (err) goto out_err; /* * 5.6.1.1.3, step 5 is implicit: 2^N < q and thus, * M = min(2^N, q) = 2^N. * * For step 6, calculate * key = (key[] mod (M - 1)) + 1 = (key[] mod (2^N - 1)) + 1. * * In order to avoid expensive divisions, note that * 2^N mod (2^N - 1) = 1 and thus, for any integer h, * 2^N * h mod (2^N - 1) = h mod (2^N - 1) always holds. * The big endian integer key[] composed of n + 1 64bit words * may be written as key[] = h * 2^N + l, with h = key[0] * representing the 64 most significant bits and l * corresponding to the remaining 2^N bits. With the remark * from above, * h * 2^N + l mod (2^N - 1) = l + h mod (2^N - 1). * As both, l and h are less than 2^N, their sum after * this first reduction is guaranteed to be <= 2^(N + 1) - 2. * Or equivalently, that their sum can again be written as * h' * 2^N + l' with h' now either zero or one and if one, * then l' <= 2^N - 2. Thus, all bits at positions >= N will * be zero after a second reduction: * h' * 2^N + l' mod (2^N - 1) = l' + h' mod (2^N - 1). * At this point, it is still possible that * l' + h' = 2^N - 1, i.e. that l' + h' mod (2^N - 1) * is zero. This condition will be detected below by means of * the final increment overflowing in this case. */ h = be64_to_cpu(key[0]); h = __add_u64_to_be(key + 1, n, h); h = __add_u64_to_be(key + 1, n, h); WARN_ON_ONCE(h); /* Increment to obtain the final result. */ o = __add_u64_to_be(key + 1, n, 1); /* * The overflow bit o from the increment is either zero or * one. If zero, key[1:n] holds the final result in big-endian * order. If one, key[1:n] is zero now, but needs to be set to * one, c.f. above. */ if (o) key[n] = cpu_to_be64(1); /* n is in units of u64, convert to bytes. */ *key_size = n << 3; /* Strip the leading extra __be64, which is (virtually) zero by now. 
*/ memmove(key, &key[1], *key_size); return key; out_err: kfree_sensitive(key); return ERR_PTR(err); } static int dh_safe_prime_set_secret(struct crypto_kpp *tfm, const void *buffer, unsigned int len) { struct dh_safe_prime_instance_ctx *inst_ctx = dh_safe_prime_instance_ctx(tfm); struct dh_safe_prime_tfm_ctx *tfm_ctx = kpp_tfm_ctx(tfm); struct dh params = {}; void *buf = NULL, *key = NULL; unsigned int buf_size; int err; if (buffer) { err = __crypto_dh_decode_key(buffer, len, &params); if (err) return err; if (params.p_size || params.g_size) return -EINVAL; } params.p = inst_ctx->safe_prime->p; params.p_size = inst_ctx->safe_prime->p_size; params.g = safe_prime_g; params.g_size = sizeof(safe_prime_g); if (!params.key_size) { key = dh_safe_prime_gen_privkey(inst_ctx->safe_prime, &params.key_size); if (IS_ERR(key)) return PTR_ERR(key); params.key = key; } buf_size = crypto_dh_key_len(&params); buf = kmalloc(buf_size, GFP_KERNEL); if (!buf) { err = -ENOMEM; goto out; } err = crypto_dh_encode_key(buf, buf_size, &params); if (err) goto out; err = crypto_kpp_set_secret(tfm_ctx->dh_tfm, buf, buf_size); out: kfree_sensitive(buf); kfree_sensitive(key); return err; } static void dh_safe_prime_complete_req(void *data, int err) { struct kpp_request *req = data; kpp_request_complete(req, err); } static struct kpp_request *dh_safe_prime_prepare_dh_req(struct kpp_request *req) { struct dh_safe_prime_tfm_ctx *tfm_ctx = kpp_tfm_ctx(crypto_kpp_reqtfm(req)); struct kpp_request *dh_req = kpp_request_ctx(req); kpp_request_set_tfm(dh_req, tfm_ctx->dh_tfm); kpp_request_set_callback(dh_req, req->base.flags, dh_safe_prime_complete_req, req); kpp_request_set_input(dh_req, req->src, req->src_len); kpp_request_set_output(dh_req, req->dst, req->dst_len); return dh_req; } static int dh_safe_prime_generate_public_key(struct kpp_request *req) { struct kpp_request *dh_req = dh_safe_prime_prepare_dh_req(req); return crypto_kpp_generate_public_key(dh_req); } static int dh_safe_prime_compute_shared_secret(struct kpp_request *req) { struct kpp_request *dh_req = dh_safe_prime_prepare_dh_req(req); return crypto_kpp_compute_shared_secret(dh_req); } static unsigned int dh_safe_prime_max_size(struct crypto_kpp *tfm) { struct dh_safe_prime_tfm_ctx *tfm_ctx = kpp_tfm_ctx(tfm); return crypto_kpp_maxsize(tfm_ctx->dh_tfm); } static int __maybe_unused __dh_safe_prime_create( struct crypto_template *tmpl, struct rtattr **tb, const struct dh_safe_prime *safe_prime) { struct kpp_instance *inst; struct dh_safe_prime_instance_ctx *ctx; const char *dh_name; struct kpp_alg *dh_alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_KPP, &mask); if (err) return err; dh_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(dh_name)) return PTR_ERR(dh_name); inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = kpp_instance_ctx(inst); err = crypto_grab_kpp(&ctx->dh_spawn, kpp_crypto_instance(inst), dh_name, 0, mask); if (err) goto err_free_inst; err = -EINVAL; dh_alg = crypto_spawn_kpp_alg(&ctx->dh_spawn); if (strcmp(dh_alg->base.cra_name, "dh")) goto err_free_inst; ctx->safe_prime = safe_prime; err = crypto_inst_setname(kpp_crypto_instance(inst), tmpl->name, &dh_alg->base); if (err) goto err_free_inst; inst->alg.set_secret = dh_safe_prime_set_secret; inst->alg.generate_public_key = dh_safe_prime_generate_public_key; inst->alg.compute_shared_secret = dh_safe_prime_compute_shared_secret; inst->alg.max_size = dh_safe_prime_max_size; inst->alg.init = dh_safe_prime_init_tfm; inst->alg.exit = 
dh_safe_prime_exit_tfm; inst->alg.base.cra_priority = dh_alg->base.cra_priority; inst->alg.base.cra_module = THIS_MODULE; inst->alg.base.cra_ctxsize = sizeof(struct dh_safe_prime_tfm_ctx); inst->free = dh_safe_prime_free_instance; err = kpp_register_instance(tmpl, inst); if (err) goto err_free_inst; return 0; err_free_inst: dh_safe_prime_free_instance(inst); return err; } #ifdef CONFIG_CRYPTO_DH_RFC7919_GROUPS static const struct dh_safe_prime ffdhe2048_prime = { .max_strength = 112, .p_size = 256, .p = "\xff\xff\xff\xff\xff\xff\xff\xff\xad\xf8\x54\x58\xa2\xbb\x4a\x9a" "\xaf\xdc\x56\x20\x27\x3d\x3c\xf1\xd8\xb9\xc5\x83\xce\x2d\x36\x95" "\xa9\xe1\x36\x41\x14\x64\x33\xfb\xcc\x93\x9d\xce\x24\x9b\x3e\xf9" "\x7d\x2f\xe3\x63\x63\x0c\x75\xd8\xf6\x81\xb2\x02\xae\xc4\x61\x7a" "\xd3\xdf\x1e\xd5\xd5\xfd\x65\x61\x24\x33\xf5\x1f\x5f\x06\x6e\xd0" "\x85\x63\x65\x55\x3d\xed\x1a\xf3\xb5\x57\x13\x5e\x7f\x57\xc9\x35" "\x98\x4f\x0c\x70\xe0\xe6\x8b\x77\xe2\xa6\x89\xda\xf3\xef\xe8\x72" "\x1d\xf1\x58\xa1\x36\xad\xe7\x35\x30\xac\xca\x4f\x48\x3a\x79\x7a" "\xbc\x0a\xb1\x82\xb3\x24\xfb\x61\xd1\x08\xa9\x4b\xb2\xc8\xe3\xfb" "\xb9\x6a\xda\xb7\x60\xd7\xf4\x68\x1d\x4f\x42\xa3\xde\x39\x4d\xf4" "\xae\x56\xed\xe7\x63\x72\xbb\x19\x0b\x07\xa7\xc8\xee\x0a\x6d\x70" "\x9e\x02\xfc\xe1\xcd\xf7\xe2\xec\xc0\x34\x04\xcd\x28\x34\x2f\x61" "\x91\x72\xfe\x9c\xe9\x85\x83\xff\x8e\x4f\x12\x32\xee\xf2\x81\x83" "\xc3\xfe\x3b\x1b\x4c\x6f\xad\x73\x3b\xb5\xfc\xbc\x2e\xc2\x20\x05" "\xc5\x8e\xf1\x83\x7d\x16\x83\xb2\xc6\xf3\x4a\x26\xc1\xb2\xef\xfa" "\x88\x6b\x42\x38\x61\x28\x5c\x97\xff\xff\xff\xff\xff\xff\xff\xff", }; static const struct dh_safe_prime ffdhe3072_prime = { .max_strength = 128, .p_size = 384, .p = "\xff\xff\xff\xff\xff\xff\xff\xff\xad\xf8\x54\x58\xa2\xbb\x4a\x9a" "\xaf\xdc\x56\x20\x27\x3d\x3c\xf1\xd8\xb9\xc5\x83\xce\x2d\x36\x95" "\xa9\xe1\x36\x41\x14\x64\x33\xfb\xcc\x93\x9d\xce\x24\x9b\x3e\xf9" "\x7d\x2f\xe3\x63\x63\x0c\x75\xd8\xf6\x81\xb2\x02\xae\xc4\x61\x7a" "\xd3\xdf\x1e\xd5\xd5\xfd\x65\x61\x24\x33\xf5\x1f\x5f\x06\x6e\xd0" "\x85\x63\x65\x55\x3d\xed\x1a\xf3\xb5\x57\x13\x5e\x7f\x57\xc9\x35" "\x98\x4f\x0c\x70\xe0\xe6\x8b\x77\xe2\xa6\x89\xda\xf3\xef\xe8\x72" "\x1d\xf1\x58\xa1\x36\xad\xe7\x35\x30\xac\xca\x4f\x48\x3a\x79\x7a" "\xbc\x0a\xb1\x82\xb3\x24\xfb\x61\xd1\x08\xa9\x4b\xb2\xc8\xe3\xfb" "\xb9\x6a\xda\xb7\x60\xd7\xf4\x68\x1d\x4f\x42\xa3\xde\x39\x4d\xf4" "\xae\x56\xed\xe7\x63\x72\xbb\x19\x0b\x07\xa7\xc8\xee\x0a\x6d\x70" "\x9e\x02\xfc\xe1\xcd\xf7\xe2\xec\xc0\x34\x04\xcd\x28\x34\x2f\x61" "\x91\x72\xfe\x9c\xe9\x85\x83\xff\x8e\x4f\x12\x32\xee\xf2\x81\x83" "\xc3\xfe\x3b\x1b\x4c\x6f\xad\x73\x3b\xb5\xfc\xbc\x2e\xc2\x20\x05" "\xc5\x8e\xf1\x83\x7d\x16\x83\xb2\xc6\xf3\x4a\x26\xc1\xb2\xef\xfa" "\x88\x6b\x42\x38\x61\x1f\xcf\xdc\xde\x35\x5b\x3b\x65\x19\x03\x5b" "\xbc\x34\xf4\xde\xf9\x9c\x02\x38\x61\xb4\x6f\xc9\xd6\xe6\xc9\x07" "\x7a\xd9\x1d\x26\x91\xf7\xf7\xee\x59\x8c\xb0\xfa\xc1\x86\xd9\x1c" "\xae\xfe\x13\x09\x85\x13\x92\x70\xb4\x13\x0c\x93\xbc\x43\x79\x44" "\xf4\xfd\x44\x52\xe2\xd7\x4d\xd3\x64\xf2\xe2\x1e\x71\xf5\x4b\xff" "\x5c\xae\x82\xab\x9c\x9d\xf6\x9e\xe8\x6d\x2b\xc5\x22\x36\x3a\x0d" "\xab\xc5\x21\x97\x9b\x0d\xea\xda\x1d\xbf\x9a\x42\xd5\xc4\x48\x4e" "\x0a\xbc\xd0\x6b\xfa\x53\xdd\xef\x3c\x1b\x20\xee\x3f\xd5\x9d\x7c" "\x25\xe4\x1d\x2b\x66\xc6\x2e\x37\xff\xff\xff\xff\xff\xff\xff\xff", }; static const struct dh_safe_prime ffdhe4096_prime = { .max_strength = 152, .p_size = 512, .p = "\xff\xff\xff\xff\xff\xff\xff\xff\xad\xf8\x54\x58\xa2\xbb\x4a\x9a" "\xaf\xdc\x56\x20\x27\x3d\x3c\xf1\xd8\xb9\xc5\x83\xce\x2d\x36\x95" 
"\xa9\xe1\x36\x41\x14\x64\x33\xfb\xcc\x93\x9d\xce\x24\x9b\x3e\xf9" "\x7d\x2f\xe3\x63\x63\x0c\x75\xd8\xf6\x81\xb2\x02\xae\xc4\x61\x7a" "\xd3\xdf\x1e\xd5\xd5\xfd\x65\x61\x24\x33\xf5\x1f\x5f\x06\x6e\xd0" "\x85\x63\x65\x55\x3d\xed\x1a\xf3\xb5\x57\x13\x5e\x7f\x57\xc9\x35" "\x98\x4f\x0c\x70\xe0\xe6\x8b\x77\xe2\xa6\x89\xda\xf3\xef\xe8\x72" "\x1d\xf1\x58\xa1\x36\xad\xe7\x35\x30\xac\xca\x4f\x48\x3a\x79\x7a" "\xbc\x0a\xb1\x82\xb3\x24\xfb\x61\xd1\x08\xa9\x4b\xb2\xc8\xe3\xfb" "\xb9\x6a\xda\xb7\x60\xd7\xf4\x68\x1d\x4f\x42\xa3\xde\x39\x4d\xf4" "\xae\x56\xed\xe7\x63\x72\xbb\x19\x0b\x07\xa7\xc8\xee\x0a\x6d\x70" "\x9e\x02\xfc\xe1\xcd\xf7\xe2\xec\xc0\x34\x04\xcd\x28\x34\x2f\x61" "\x91\x72\xfe\x9c\xe9\x85\x83\xff\x8e\x4f\x12\x32\xee\xf2\x81\x83" "\xc3\xfe\x3b\x1b\x4c\x6f\xad\x73\x3b\xb5\xfc\xbc\x2e\xc2\x20\x05" "\xc5\x8e\xf1\x83\x7d\x16\x83\xb2\xc6\xf3\x4a\x26\xc1\xb2\xef\xfa" "\x88\x6b\x42\x38\x61\x1f\xcf\xdc\xde\x35\x5b\x3b\x65\x19\x03\x5b" "\xbc\x34\xf4\xde\xf9\x9c\x02\x38\x61\xb4\x6f\xc9\xd6\xe6\xc9\x07" "\x7a\xd9\x1d\x26\x91\xf7\xf7\xee\x59\x8c\xb0\xfa\xc1\x86\xd9\x1c" "\xae\xfe\x13\x09\x85\x13\x92\x70\xb4\x13\x0c\x93\xbc\x43\x79\x44" "\xf4\xfd\x44\x52\xe2\xd7\x4d\xd3\x64\xf2\xe2\x1e\x71\xf5\x4b\xff" "\x5c\xae\x82\xab\x9c\x9d\xf6\x9e\xe8\x6d\x2b\xc5\x22\x36\x3a\x0d" "\xab\xc5\x21\x97\x9b\x0d\xea\xda\x1d\xbf\x9a\x42\xd5\xc4\x48\x4e" "\x0a\xbc\xd0\x6b\xfa\x53\xdd\xef\x3c\x1b\x20\xee\x3f\xd5\x9d\x7c" "\x25\xe4\x1d\x2b\x66\x9e\x1e\xf1\x6e\x6f\x52\xc3\x16\x4d\xf4\xfb" "\x79\x30\xe9\xe4\xe5\x88\x57\xb6\xac\x7d\x5f\x42\xd6\x9f\x6d\x18" "\x77\x63\xcf\x1d\x55\x03\x40\x04\x87\xf5\x5b\xa5\x7e\x31\xcc\x7a" "\x71\x35\xc8\x86\xef\xb4\x31\x8a\xed\x6a\x1e\x01\x2d\x9e\x68\x32" "\xa9\x07\x60\x0a\x91\x81\x30\xc4\x6d\xc7\x78\xf9\x71\xad\x00\x38" "\x09\x29\x99\xa3\x33\xcb\x8b\x7a\x1a\x1d\xb9\x3d\x71\x40\x00\x3c" "\x2a\x4e\xce\xa9\xf9\x8d\x0a\xcc\x0a\x82\x91\xcd\xce\xc9\x7d\xcf" "\x8e\xc9\xb5\x5a\x7f\x88\xa4\x6b\x4d\xb5\xa8\x51\xf4\x41\x82\xe1" "\xc6\x8a\x00\x7e\x5e\x65\x5f\x6a\xff\xff\xff\xff\xff\xff\xff\xff", }; static const struct dh_safe_prime ffdhe6144_prime = { .max_strength = 176, .p_size = 768, .p = "\xff\xff\xff\xff\xff\xff\xff\xff\xad\xf8\x54\x58\xa2\xbb\x4a\x9a" "\xaf\xdc\x56\x20\x27\x3d\x3c\xf1\xd8\xb9\xc5\x83\xce\x2d\x36\x95" "\xa9\xe1\x36\x41\x14\x64\x33\xfb\xcc\x93\x9d\xce\x24\x9b\x3e\xf9" "\x7d\x2f\xe3\x63\x63\x0c\x75\xd8\xf6\x81\xb2\x02\xae\xc4\x61\x7a" "\xd3\xdf\x1e\xd5\xd5\xfd\x65\x61\x24\x33\xf5\x1f\x5f\x06\x6e\xd0" "\x85\x63\x65\x55\x3d\xed\x1a\xf3\xb5\x57\x13\x5e\x7f\x57\xc9\x35" "\x98\x4f\x0c\x70\xe0\xe6\x8b\x77\xe2\xa6\x89\xda\xf3\xef\xe8\x72" "\x1d\xf1\x58\xa1\x36\xad\xe7\x35\x30\xac\xca\x4f\x48\x3a\x79\x7a" "\xbc\x0a\xb1\x82\xb3\x24\xfb\x61\xd1\x08\xa9\x4b\xb2\xc8\xe3\xfb" "\xb9\x6a\xda\xb7\x60\xd7\xf4\x68\x1d\x4f\x42\xa3\xde\x39\x4d\xf4" "\xae\x56\xed\xe7\x63\x72\xbb\x19\x0b\x07\xa7\xc8\xee\x0a\x6d\x70" "\x9e\x02\xfc\xe1\xcd\xf7\xe2\xec\xc0\x34\x04\xcd\x28\x34\x2f\x61" "\x91\x72\xfe\x9c\xe9\x85\x83\xff\x8e\x4f\x12\x32\xee\xf2\x81\x83" "\xc3\xfe\x3b\x1b\x4c\x6f\xad\x73\x3b\xb5\xfc\xbc\x2e\xc2\x20\x05" "\xc5\x8e\xf1\x83\x7d\x16\x83\xb2\xc6\xf3\x4a\x26\xc1\xb2\xef\xfa" "\x88\x6b\x42\x38\x61\x1f\xcf\xdc\xde\x35\x5b\x3b\x65\x19\x03\x5b" "\xbc\x34\xf4\xde\xf9\x9c\x02\x38\x61\xb4\x6f\xc9\xd6\xe6\xc9\x07" "\x7a\xd9\x1d\x26\x91\xf7\xf7\xee\x59\x8c\xb0\xfa\xc1\x86\xd9\x1c" "\xae\xfe\x13\x09\x85\x13\x92\x70\xb4\x13\x0c\x93\xbc\x43\x79\x44" "\xf4\xfd\x44\x52\xe2\xd7\x4d\xd3\x64\xf2\xe2\x1e\x71\xf5\x4b\xff" "\x5c\xae\x82\xab\x9c\x9d\xf6\x9e\xe8\x6d\x2b\xc5\x22\x36\x3a\x0d" 
"\xab\xc5\x21\x97\x9b\x0d\xea\xda\x1d\xbf\x9a\x42\xd5\xc4\x48\x4e" "\x0a\xbc\xd0\x6b\xfa\x53\xdd\xef\x3c\x1b\x20\xee\x3f\xd5\x9d\x7c" "\x25\xe4\x1d\x2b\x66\x9e\x1e\xf1\x6e\x6f\x52\xc3\x16\x4d\xf4\xfb" "\x79\x30\xe9\xe4\xe5\x88\x57\xb6\xac\x7d\x5f\x42\xd6\x9f\x6d\x18" "\x77\x63\xcf\x1d\x55\x03\x40\x04\x87\xf5\x5b\xa5\x7e\x31\xcc\x7a" "\x71\x35\xc8\x86\xef\xb4\x31\x8a\xed\x6a\x1e\x01\x2d\x9e\x68\x32" "\xa9\x07\x60\x0a\x91\x81\x30\xc4\x6d\xc7\x78\xf9\x71\xad\x00\x38" "\x09\x29\x99\xa3\x33\xcb\x8b\x7a\x1a\x1d\xb9\x3d\x71\x40\x00\x3c" "\x2a\x4e\xce\xa9\xf9\x8d\x0a\xcc\x0a\x82\x91\xcd\xce\xc9\x7d\xcf" "\x8e\xc9\xb5\x5a\x7f\x88\xa4\x6b\x4d\xb5\xa8\x51\xf4\x41\x82\xe1" "\xc6\x8a\x00\x7e\x5e\x0d\xd9\x02\x0b\xfd\x64\xb6\x45\x03\x6c\x7a" "\x4e\x67\x7d\x2c\x38\x53\x2a\x3a\x23\xba\x44\x42\xca\xf5\x3e\xa6" "\x3b\xb4\x54\x32\x9b\x76\x24\xc8\x91\x7b\xdd\x64\xb1\xc0\xfd\x4c" "\xb3\x8e\x8c\x33\x4c\x70\x1c\x3a\xcd\xad\x06\x57\xfc\xcf\xec\x71" "\x9b\x1f\x5c\x3e\x4e\x46\x04\x1f\x38\x81\x47\xfb\x4c\xfd\xb4\x77" "\xa5\x24\x71\xf7\xa9\xa9\x69\x10\xb8\x55\x32\x2e\xdb\x63\x40\xd8" "\xa0\x0e\xf0\x92\x35\x05\x11\xe3\x0a\xbe\xc1\xff\xf9\xe3\xa2\x6e" "\x7f\xb2\x9f\x8c\x18\x30\x23\xc3\x58\x7e\x38\xda\x00\x77\xd9\xb4" "\x76\x3e\x4e\x4b\x94\xb2\xbb\xc1\x94\xc6\x65\x1e\x77\xca\xf9\x92" "\xee\xaa\xc0\x23\x2a\x28\x1b\xf6\xb3\xa7\x39\xc1\x22\x61\x16\x82" "\x0a\xe8\xdb\x58\x47\xa6\x7c\xbe\xf9\xc9\x09\x1b\x46\x2d\x53\x8c" "\xd7\x2b\x03\x74\x6a\xe7\x7f\x5e\x62\x29\x2c\x31\x15\x62\xa8\x46" "\x50\x5d\xc8\x2d\xb8\x54\x33\x8a\xe4\x9f\x52\x35\xc9\x5b\x91\x17" "\x8c\xcf\x2d\xd5\xca\xce\xf4\x03\xec\x9d\x18\x10\xc6\x27\x2b\x04" "\x5b\x3b\x71\xf9\xdc\x6b\x80\xd6\x3f\xdd\x4a\x8e\x9a\xdb\x1e\x69" "\x62\xa6\x95\x26\xd4\x31\x61\xc1\xa4\x1d\x57\x0d\x79\x38\xda\xd4" "\xa4\x0e\x32\x9c\xd0\xe4\x0e\x65\xff\xff\xff\xff\xff\xff\xff\xff", }; static const struct dh_safe_prime ffdhe8192_prime = { .max_strength = 200, .p_size = 1024, .p = "\xff\xff\xff\xff\xff\xff\xff\xff\xad\xf8\x54\x58\xa2\xbb\x4a\x9a" "\xaf\xdc\x56\x20\x27\x3d\x3c\xf1\xd8\xb9\xc5\x83\xce\x2d\x36\x95" "\xa9\xe1\x36\x41\x14\x64\x33\xfb\xcc\x93\x9d\xce\x24\x9b\x3e\xf9" "\x7d\x2f\xe3\x63\x63\x0c\x75\xd8\xf6\x81\xb2\x02\xae\xc4\x61\x7a" "\xd3\xdf\x1e\xd5\xd5\xfd\x65\x61\x24\x33\xf5\x1f\x5f\x06\x6e\xd0" "\x85\x63\x65\x55\x3d\xed\x1a\xf3\xb5\x57\x13\x5e\x7f\x57\xc9\x35" "\x98\x4f\x0c\x70\xe0\xe6\x8b\x77\xe2\xa6\x89\xda\xf3\xef\xe8\x72" "\x1d\xf1\x58\xa1\x36\xad\xe7\x35\x30\xac\xca\x4f\x48\x3a\x79\x7a" "\xbc\x0a\xb1\x82\xb3\x24\xfb\x61\xd1\x08\xa9\x4b\xb2\xc8\xe3\xfb" "\xb9\x6a\xda\xb7\x60\xd7\xf4\x68\x1d\x4f\x42\xa3\xde\x39\x4d\xf4" "\xae\x56\xed\xe7\x63\x72\xbb\x19\x0b\x07\xa7\xc8\xee\x0a\x6d\x70" "\x9e\x02\xfc\xe1\xcd\xf7\xe2\xec\xc0\x34\x04\xcd\x28\x34\x2f\x61" "\x91\x72\xfe\x9c\xe9\x85\x83\xff\x8e\x4f\x12\x32\xee\xf2\x81\x83" "\xc3\xfe\x3b\x1b\x4c\x6f\xad\x73\x3b\xb5\xfc\xbc\x2e\xc2\x20\x05" "\xc5\x8e\xf1\x83\x7d\x16\x83\xb2\xc6\xf3\x4a\x26\xc1\xb2\xef\xfa" "\x88\x6b\x42\x38\x61\x1f\xcf\xdc\xde\x35\x5b\x3b\x65\x19\x03\x5b" "\xbc\x34\xf4\xde\xf9\x9c\x02\x38\x61\xb4\x6f\xc9\xd6\xe6\xc9\x07" "\x7a\xd9\x1d\x26\x91\xf7\xf7\xee\x59\x8c\xb0\xfa\xc1\x86\xd9\x1c" "\xae\xfe\x13\x09\x85\x13\x92\x70\xb4\x13\x0c\x93\xbc\x43\x79\x44" "\xf4\xfd\x44\x52\xe2\xd7\x4d\xd3\x64\xf2\xe2\x1e\x71\xf5\x4b\xff" "\x5c\xae\x82\xab\x9c\x9d\xf6\x9e\xe8\x6d\x2b\xc5\x22\x36\x3a\x0d" "\xab\xc5\x21\x97\x9b\x0d\xea\xda\x1d\xbf\x9a\x42\xd5\xc4\x48\x4e" "\x0a\xbc\xd0\x6b\xfa\x53\xdd\xef\x3c\x1b\x20\xee\x3f\xd5\x9d\x7c" "\x25\xe4\x1d\x2b\x66\x9e\x1e\xf1\x6e\x6f\x52\xc3\x16\x4d\xf4\xfb" 
"\x79\x30\xe9\xe4\xe5\x88\x57\xb6\xac\x7d\x5f\x42\xd6\x9f\x6d\x18" "\x77\x63\xcf\x1d\x55\x03\x40\x04\x87\xf5\x5b\xa5\x7e\x31\xcc\x7a" "\x71\x35\xc8\x86\xef\xb4\x31\x8a\xed\x6a\x1e\x01\x2d\x9e\x68\x32" "\xa9\x07\x60\x0a\x91\x81\x30\xc4\x6d\xc7\x78\xf9\x71\xad\x00\x38" "\x09\x29\x99\xa3\x33\xcb\x8b\x7a\x1a\x1d\xb9\x3d\x71\x40\x00\x3c" "\x2a\x4e\xce\xa9\xf9\x8d\x0a\xcc\x0a\x82\x91\xcd\xce\xc9\x7d\xcf" "\x8e\xc9\xb5\x5a\x7f\x88\xa4\x6b\x4d\xb5\xa8\x51\xf4\x41\x82\xe1" "\xc6\x8a\x00\x7e\x5e\x0d\xd9\x02\x0b\xfd\x64\xb6\x45\x03\x6c\x7a" "\x4e\x67\x7d\x2c\x38\x53\x2a\x3a\x23\xba\x44\x42\xca\xf5\x3e\xa6" "\x3b\xb4\x54\x32\x9b\x76\x24\xc8\x91\x7b\xdd\x64\xb1\xc0\xfd\x4c" "\xb3\x8e\x8c\x33\x4c\x70\x1c\x3a\xcd\xad\x06\x57\xfc\xcf\xec\x71" "\x9b\x1f\x5c\x3e\x4e\x46\x04\x1f\x38\x81\x47\xfb\x4c\xfd\xb4\x77" "\xa5\x24\x71\xf7\xa9\xa9\x69\x10\xb8\x55\x32\x2e\xdb\x63\x40\xd8" "\xa0\x0e\xf0\x92\x35\x05\x11\xe3\x0a\xbe\xc1\xff\xf9\xe3\xa2\x6e" "\x7f\xb2\x9f\x8c\x18\x30\x23\xc3\x58\x7e\x38\xda\x00\x77\xd9\xb4" "\x76\x3e\x4e\x4b\x94\xb2\xbb\xc1\x94\xc6\x65\x1e\x77\xca\xf9\x92" "\xee\xaa\xc0\x23\x2a\x28\x1b\xf6\xb3\xa7\x39\xc1\x22\x61\x16\x82" "\x0a\xe8\xdb\x58\x47\xa6\x7c\xbe\xf9\xc9\x09\x1b\x46\x2d\x53\x8c" "\xd7\x2b\x03\x74\x6a\xe7\x7f\x5e\x62\x29\x2c\x31\x15\x62\xa8\x46" "\x50\x5d\xc8\x2d\xb8\x54\x33\x8a\xe4\x9f\x52\x35\xc9\x5b\x91\x17" "\x8c\xcf\x2d\xd5\xca\xce\xf4\x03\xec\x9d\x18\x10\xc6\x27\x2b\x04" "\x5b\x3b\x71\xf9\xdc\x6b\x80\xd6\x3f\xdd\x4a\x8e\x9a\xdb\x1e\x69" "\x62\xa6\x95\x26\xd4\x31\x61\xc1\xa4\x1d\x57\x0d\x79\x38\xda\xd4" "\xa4\x0e\x32\x9c\xcf\xf4\x6a\xaa\x36\xad\x00\x4c\xf6\x00\xc8\x38" "\x1e\x42\x5a\x31\xd9\x51\xae\x64\xfd\xb2\x3f\xce\xc9\x50\x9d\x43" "\x68\x7f\xeb\x69\xed\xd1\xcc\x5e\x0b\x8c\xc3\xbd\xf6\x4b\x10\xef" "\x86\xb6\x31\x42\xa3\xab\x88\x29\x55\x5b\x2f\x74\x7c\x93\x26\x65" "\xcb\x2c\x0f\x1c\xc0\x1b\xd7\x02\x29\x38\x88\x39\xd2\xaf\x05\xe4" "\x54\x50\x4a\xc7\x8b\x75\x82\x82\x28\x46\xc0\xba\x35\xc3\x5f\x5c" "\x59\x16\x0c\xc0\x46\xfd\x82\x51\x54\x1f\xc6\x8c\x9c\x86\xb0\x22" "\xbb\x70\x99\x87\x6a\x46\x0e\x74\x51\xa8\xa9\x31\x09\x70\x3f\xee" "\x1c\x21\x7e\x6c\x38\x26\xe5\x2c\x51\xaa\x69\x1e\x0e\x42\x3c\xfc" "\x99\xe9\xe3\x16\x50\xc1\x21\x7b\x62\x48\x16\xcd\xad\x9a\x95\xf9" "\xd5\xb8\x01\x94\x88\xd9\xc0\xa0\xa1\xfe\x30\x75\xa5\x77\xe2\x31" "\x83\xf8\x1d\x4a\x3f\x2f\xa4\x57\x1e\xfc\x8c\xe0\xba\x8a\x4f\xe8" "\xb6\x85\x5d\xfe\x72\xb0\xa6\x6e\xde\xd2\xfb\xab\xfb\xe5\x8a\x30" "\xfa\xfa\xbe\x1c\x5d\x71\xa8\x7e\x2f\x74\x1e\xf8\xc1\xfe\x86\xfe" "\xa6\xbb\xfd\xe5\x30\x67\x7f\x0d\x97\xd1\x1d\x49\xf7\xa8\x44\x3d" "\x08\x22\xe5\x06\xa9\xf4\x61\x4e\x01\x1e\x2a\x94\x83\x8f\xf8\x8c" "\xd6\x8c\x8b\xb7\xc5\xc6\x42\x4c\xff\xff\xff\xff\xff\xff\xff\xff", }; static int dh_ffdhe2048_create(struct crypto_template *tmpl, struct rtattr **tb) { return __dh_safe_prime_create(tmpl, tb, &ffdhe2048_prime); } static int dh_ffdhe3072_create(struct crypto_template *tmpl, struct rtattr **tb) { return __dh_safe_prime_create(tmpl, tb, &ffdhe3072_prime); } static int dh_ffdhe4096_create(struct crypto_template *tmpl, struct rtattr **tb) { return __dh_safe_prime_create(tmpl, tb, &ffdhe4096_prime); } static int dh_ffdhe6144_create(struct crypto_template *tmpl, struct rtattr **tb) { return __dh_safe_prime_create(tmpl, tb, &ffdhe6144_prime); } static int dh_ffdhe8192_create(struct crypto_template *tmpl, struct rtattr **tb) { return __dh_safe_prime_create(tmpl, tb, &ffdhe8192_prime); } static struct crypto_template crypto_ffdhe_templates[] = { { .name = "ffdhe2048", .create = dh_ffdhe2048_create, .module = THIS_MODULE, }, { .name = 
"ffdhe3072", .create = dh_ffdhe3072_create, .module = THIS_MODULE, }, { .name = "ffdhe4096", .create = dh_ffdhe4096_create, .module = THIS_MODULE, }, { .name = "ffdhe6144", .create = dh_ffdhe6144_create, .module = THIS_MODULE, }, { .name = "ffdhe8192", .create = dh_ffdhe8192_create, .module = THIS_MODULE, }, }; #else /* ! CONFIG_CRYPTO_DH_RFC7919_GROUPS */ static struct crypto_template crypto_ffdhe_templates[] = {}; #endif /* CONFIG_CRYPTO_DH_RFC7919_GROUPS */ static int __init dh_init(void) { int err; err = crypto_register_kpp(&dh); if (err) return err; err = crypto_register_templates(crypto_ffdhe_templates, ARRAY_SIZE(crypto_ffdhe_templates)); if (err) { crypto_unregister_kpp(&dh); return err; } return 0; } static void __exit dh_exit(void) { crypto_unregister_templates(crypto_ffdhe_templates, ARRAY_SIZE(crypto_ffdhe_templates)); crypto_unregister_kpp(&dh); } module_init(dh_init); module_exit(dh_exit); MODULE_ALIAS_CRYPTO("dh"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("DH generic algorithm");
/* SPDX-License-Identifier: GPL-2.0-or-later */ /*************************************************************************** * Linux PPP over X - Generic PPP transport layer sockets * Linux PPP over Ethernet (PPPoE) Socket Implementation (RFC 2516) * * This file supplies definitions required by the PPP over Ethernet driver * (pppox.c). All version information wrt this file is located in pppox.c */ #ifndef __LINUX_IF_PPPOX_H #define __LINUX_IF_PPPOX_H #include <linux/if.h> #include <linux/netdevice.h> #include <linux/ppp_channel.h> #include <linux/skbuff.h> #include <linux/workqueue.h> #include <uapi/linux/if_pppox.h> static inline struct pppoe_hdr *pppoe_hdr(const struct sk_buff *skb) { return (struct pppoe_hdr *)skb_network_header(skb); } struct pppoe_opt { struct net_device *dev; /* device associated with socket*/ int ifindex; /* ifindex of device associated with socket */ struct pppoe_addr pa; /* what this socket is bound to*/ struct sockaddr_pppox relay; /* what socket data will be relayed to (PPPoE relaying) */ struct work_struct padt_work;/* Work item for handling PADT */ }; struct pptp_opt { struct pptp_addr src_addr; struct pptp_addr dst_addr; u32 ack_sent, ack_recv; u32 seq_sent, seq_recv; int ppp_flags; }; #include <net/sock.h> struct pppox_sock { /* struct sock must be the first member of pppox_sock */ struct sock sk; struct ppp_channel chan; struct pppox_sock __rcu *next; /* for hash table */ union { struct pppoe_opt pppoe; struct pptp_opt pptp; } proto; __be16 num; }; #define pppoe_dev proto.pppoe.dev #define pppoe_ifindex proto.pppoe.ifindex #define pppoe_pa proto.pppoe.pa #define pppoe_relay proto.pppoe.relay static inline struct pppox_sock *pppox_sk(struct sock *sk) { return (struct pppox_sock *)sk; } static inline struct sock *sk_pppox(struct pppox_sock *po) { return (struct sock *)po; } struct module; struct pppox_proto { int (*create)(struct net *net, struct socket *sock, int kern); int (*ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); struct module *owner; }; extern int register_pppox_proto(int proto_num, const struct pppox_proto *pp); extern void unregister_pppox_proto(int proto_num); extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) /* PPPoX socket states */ enum { PPPOX_NONE = 0, /* initial state */ PPPOX_CONNECTED = 1, /* connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ PPPOX_RELAY = 4, /* forwarding is enabled */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; #endif /* !(__LINUX_IF_PPPOX_H) */
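For reference, a minimal sketch of how a transport hooks into the pppox layer declared above: it fills in a struct pppox_proto and registers it against one of the PX_PROTO_* slots from <uapi/linux/if_pppox.h>. All example_* names are hypothetical, the create callback is a stub, and a real transport would claim its own protocol slot; PX_PROTO_OE is used here only as a familiar constant and in practice belongs to the PPPoE driver.

#include <linux/if_pppox.h>
#include <linux/module.h>
#include <net/sock.h>

/* Stub: a real transport would allocate and initialize a pppox_sock here. */
static int example_pppox_create(struct net *net, struct socket *sock, int kern)
{
	return -EPROTONOSUPPORT;
}

static const struct pppox_proto example_pppox_proto = {
	.create	= example_pppox_create,
	/* .ioctl is optional; leave NULL if no transport-specific ioctls */
	.owner	= THIS_MODULE,
};

static int __init example_pppox_init(void)
{
	return register_pppox_proto(PX_PROTO_OE, &example_pppox_proto);
}

static void __exit example_pppox_exit(void)
{
	unregister_pppox_proto(PX_PROTO_OE);
}

module_init(example_pppox_init);
module_exit(example_pppox_exit);
MODULE_DESCRIPTION("Illustrative pppox transport stub");
MODULE_LICENSE("GPL");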
// SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2005 Mike Isely <isely@pobox.com> */ #include "pvrusb2-context.h" #include "pvrusb2-io.h" #include "pvrusb2-ioread.h" #include "pvrusb2-hdw.h" #include "pvrusb2-debug.h" #include <linux/wait.h> #include <linux/kthread.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> static struct pvr2_context *pvr2_context_exist_first; static struct pvr2_context *pvr2_context_exist_last; static struct pvr2_context *pvr2_context_notify_first; static struct pvr2_context *pvr2_context_notify_last; static DEFINE_MUTEX(pvr2_context_mutex); static DECLARE_WAIT_QUEUE_HEAD(pvr2_context_sync_data); static DECLARE_WAIT_QUEUE_HEAD(pvr2_context_cleanup_data); static int pvr2_context_cleanup_flag; static int pvr2_context_cleaned_flag; static struct task_struct *pvr2_context_thread_ptr; static void pvr2_context_set_notify(struct pvr2_context *mp, int fl) { int signal_flag = 0; mutex_lock(&pvr2_context_mutex); if (fl) { if (!mp->notify_flag) { signal_flag = (pvr2_context_notify_first == NULL); mp->notify_prev = pvr2_context_notify_last; mp->notify_next = NULL; pvr2_context_notify_last = mp; if (mp->notify_prev) { mp->notify_prev->notify_next = mp; } else { pvr2_context_notify_first = mp; } mp->notify_flag = !0; } } else { if (mp->notify_flag) { mp->notify_flag = 0; if (mp->notify_next) { mp->notify_next->notify_prev = mp->notify_prev; } else { pvr2_context_notify_last = mp->notify_prev; } if (mp->notify_prev) { mp->notify_prev->notify_next = mp->notify_next; } else { pvr2_context_notify_first = mp->notify_next; } } } mutex_unlock(&pvr2_context_mutex); if (signal_flag) wake_up(&pvr2_context_sync_data); } static void pvr2_context_destroy(struct pvr2_context *mp) { pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context %p (destroy)",mp); pvr2_hdw_destroy(mp->hdw); pvr2_context_set_notify(mp, 0); mutex_lock(&pvr2_context_mutex); if (mp->exist_next) { mp->exist_next->exist_prev = mp->exist_prev; } else {
pvr2_context_exist_last = mp->exist_prev; } if (mp->exist_prev) { mp->exist_prev->exist_next = mp->exist_next; } else { pvr2_context_exist_first = mp->exist_next; } if (!pvr2_context_exist_first) { /* Trigger wakeup on control thread in case it is waiting for an exit condition. */ wake_up(&pvr2_context_sync_data); } mutex_unlock(&pvr2_context_mutex); kfree(mp); } static void pvr2_context_notify(void *ptr) { struct pvr2_context *mp = ptr; pvr2_context_set_notify(mp,!0); } static void pvr2_context_check(struct pvr2_context *mp) { struct pvr2_channel *ch1, *ch2; pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (notify)", mp); if (!mp->initialized_flag && !mp->disconnect_flag) { mp->initialized_flag = !0; pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (initialize)", mp); /* Finish hardware initialization */ if (pvr2_hdw_initialize(mp->hdw, pvr2_context_notify, mp)) { mp->video_stream.stream = pvr2_hdw_get_video_stream(mp->hdw); /* Trigger interface initialization. By doing this here initialization runs in our own safe and cozy thread context. */ if (mp->setup_func) mp->setup_func(mp); } else { pvr2_trace(PVR2_TRACE_CTXT, "pvr2_context %p (thread skipping setup)", mp); /* Even though initialization did not succeed, we're still going to continue anyway. We need to do this in order to await the expected disconnect (which we will detect in the normal course of operation). */ } } for (ch1 = mp->mc_first; ch1; ch1 = ch2) { ch2 = ch1->mc_next; if (ch1->check_func) ch1->check_func(ch1); } if (mp->disconnect_flag && !mp->mc_first) { /* Go away... */ pvr2_context_destroy(mp); return; } } static int pvr2_context_shutok(void) { return pvr2_context_cleanup_flag && (pvr2_context_exist_first == NULL); } static int pvr2_context_thread_func(void *foo) { struct pvr2_context *mp; pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread start"); do { while ((mp = pvr2_context_notify_first) != NULL) { pvr2_context_set_notify(mp, 0); pvr2_context_check(mp); } wait_event_interruptible( pvr2_context_sync_data, ((pvr2_context_notify_first != NULL) || pvr2_context_shutok())); } while (!pvr2_context_shutok()); pvr2_context_cleaned_flag = !0; wake_up(&pvr2_context_cleanup_data); pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread cleaned up"); wait_event_interruptible( pvr2_context_sync_data, kthread_should_stop()); pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context thread end"); return 0; } int pvr2_context_global_init(void) { pvr2_context_thread_ptr = kthread_run(pvr2_context_thread_func, NULL, "pvrusb2-context"); return IS_ERR(pvr2_context_thread_ptr) ? 
-ENOMEM : 0; } void pvr2_context_global_done(void) { pvr2_context_cleanup_flag = !0; wake_up(&pvr2_context_sync_data); wait_event_interruptible( pvr2_context_cleanup_data, pvr2_context_cleaned_flag); kthread_stop(pvr2_context_thread_ptr); } struct pvr2_context *pvr2_context_create( struct usb_interface *intf, const struct usb_device_id *devid, void (*setup_func)(struct pvr2_context *)) { struct pvr2_context *mp = NULL; mp = kzalloc(sizeof(*mp),GFP_KERNEL); if (!mp) goto done; pvr2_trace(PVR2_TRACE_CTXT,"pvr2_context %p (create)",mp); mp->setup_func = setup_func; mutex_init(&mp->mutex); mutex_lock(&pvr2_context_mutex); mp->exist_prev = pvr2_context_exist_last; mp->exist_next = NULL; pvr2_context_exist_last = mp; if (mp->exist_prev) { mp->exist_prev->exist_next = mp; } else { pvr2_context_exist_first = mp; } mutex_unlock(&pvr2_context_mutex); mp->hdw = pvr2_hdw_create(intf,devid); if (!mp->hdw) { pvr2_context_destroy(mp); mp = NULL; goto done; } pvr2_context_set_notify(mp, !0); done: return mp; } static void pvr2_context_reset_input_limits(struct pvr2_context *mp) { unsigned int tmsk,mmsk; struct pvr2_channel *cp; struct pvr2_hdw *hdw = mp->hdw; mmsk = pvr2_hdw_get_input_available(hdw); tmsk = mmsk; for (cp = mp->mc_first; cp; cp = cp->mc_next) { if (!cp->input_mask) continue; tmsk &= cp->input_mask; } pvr2_hdw_set_input_allowed(hdw,mmsk,tmsk); pvr2_hdw_commit_ctl(hdw); } static void pvr2_context_enter(struct pvr2_context *mp) { mutex_lock(&mp->mutex); } static void pvr2_context_exit(struct pvr2_context *mp) { int destroy_flag = 0; if (!(mp->mc_first || !mp->disconnect_flag)) { destroy_flag = !0; } mutex_unlock(&mp->mutex); if (destroy_flag) pvr2_context_notify(mp); } void pvr2_context_disconnect(struct pvr2_context *mp) { pvr2_hdw_disconnect(mp->hdw); if (!pvr2_context_shutok()) pvr2_context_notify(mp); mp->disconnect_flag = !0; } void pvr2_channel_init(struct pvr2_channel *cp,struct pvr2_context *mp) { pvr2_context_enter(mp); cp->hdw = mp->hdw; cp->mc_head = mp; cp->mc_next = NULL; cp->mc_prev = mp->mc_last; if (mp->mc_last) { mp->mc_last->mc_next = cp; } else { mp->mc_first = cp; } mp->mc_last = cp; pvr2_context_exit(mp); } static void pvr2_channel_disclaim_stream(struct pvr2_channel *cp) { if (!cp->stream) return; pvr2_stream_kill(cp->stream->stream); cp->stream->user = NULL; cp->stream = NULL; } void pvr2_channel_done(struct pvr2_channel *cp) { struct pvr2_context *mp = cp->mc_head; pvr2_context_enter(mp); cp->input_mask = 0; pvr2_channel_disclaim_stream(cp); pvr2_context_reset_input_limits(mp); if (cp->mc_next) { cp->mc_next->mc_prev = cp->mc_prev; } else { mp->mc_last = cp->mc_prev; } if (cp->mc_prev) { cp->mc_prev->mc_next = cp->mc_next; } else { mp->mc_first = cp->mc_next; } cp->hdw = NULL; pvr2_context_exit(mp); } int pvr2_channel_limit_inputs(struct pvr2_channel *cp,unsigned int cmsk) { unsigned int tmsk,mmsk; int ret = 0; struct pvr2_channel *p2; struct pvr2_hdw *hdw = cp->hdw; mmsk = pvr2_hdw_get_input_available(hdw); cmsk &= mmsk; if (cmsk == cp->input_mask) { /* No change; nothing to do */ return 0; } pvr2_context_enter(cp->mc_head); do { if (!cmsk) { cp->input_mask = 0; pvr2_context_reset_input_limits(cp->mc_head); break; } tmsk = mmsk; for (p2 = cp->mc_head->mc_first; p2; p2 = p2->mc_next) { if (p2 == cp) continue; if (!p2->input_mask) continue; tmsk &= p2->input_mask; } if (!(tmsk & cmsk)) { ret = -EPERM; break; } tmsk &= cmsk; if ((ret = pvr2_hdw_set_input_allowed(hdw,mmsk,tmsk)) != 0) { /* Internal failure changing allowed list; probably should not happen, but react if it 
does. */ break; } cp->input_mask = cmsk; pvr2_hdw_commit_ctl(hdw); } while (0); pvr2_context_exit(cp->mc_head); return ret; } unsigned int pvr2_channel_get_limited_inputs(struct pvr2_channel *cp) { return cp->input_mask; } int pvr2_channel_claim_stream(struct pvr2_channel *cp, struct pvr2_context_stream *sp) { int code = 0; pvr2_context_enter(cp->mc_head); do { if (sp == cp->stream) break; if (sp && sp->user) { code = -EBUSY; break; } pvr2_channel_disclaim_stream(cp); if (!sp) break; sp->user = cp; cp->stream = sp; } while (0); pvr2_context_exit(cp->mc_head); return code; } // This is the marker for the real beginning of a legitimate mpeg2 stream. static char stream_sync_key[] = { 0x00, 0x00, 0x01, 0xba, }; struct pvr2_ioread *pvr2_channel_create_mpeg_stream( struct pvr2_context_stream *sp) { struct pvr2_ioread *cp; cp = pvr2_ioread_create(); if (!cp) return NULL; pvr2_ioread_setup(cp,sp->stream); pvr2_ioread_set_sync_key(cp,stream_sync_key,sizeof(stream_sync_key)); return cp; }
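For reference, a minimal sketch of how the USB glue is expected to use this context layer. It mirrors the probe/disconnect pattern of the pvrusb2 driver, but the example_* names are hypothetical, error handling is trimmed, and pvr2_context_global_init() is assumed to have already started the context thread from module init.

#include <linux/usb.h>
#include "pvrusb2-context.h"

/* Hypothetical setup callback: runs in the context thread, where
 * interface-specific initialization (e.g. V4L2 registration) is safe. */
static void example_setup(struct pvr2_context *pvr)
{
	/* claim streams, register interfaces, ... */
}

static int example_probe(struct usb_interface *intf,
			 const struct usb_device_id *devid)
{
	struct pvr2_context *pvr;

	pvr = pvr2_context_create(intf, devid, example_setup);
	if (!pvr)
		return -ENOMEM;
	usb_set_intfdata(intf, pvr);
	return 0;
}

static void example_disconnect(struct usb_interface *intf)
{
	struct pvr2_context *pvr = usb_get_intfdata(intf);

	usb_set_intfdata(intf, NULL);
	pvr2_context_disconnect(pvr);	/* context tears itself down once idle */
}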
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa> */ /* Kernel module implementing an IP set type: the hash:net,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ /* 3 bucketsize, initval support added */ #define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ /* Member elements */ struct hash_netnet4_elem { union { __be32 ip[2]; __be64 ipcmp; }; u8 nomatch; u8 padding; union { u8 cidr[2]; u16 ccmp; }; }; /* Common functions */ static bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp; } static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { elem->ip[1] &= ip_set_netmask(cidr); elem->cidr[1] = cidr; } else { elem->ip[0] &= ip_set_netmask(cidr); elem->cidr[0] = cidr; } } static bool hash_netnet4_data_list(struct sk_buff *skb, const struct hash_netnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { next->ipcmp = d->ipcmp; } #define MTYPE hash_netnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static void hash_netnet4_init(struct hash_netnet4_elem *e) { e->cidr[0] = HOST_MASK; e->cidr[1] = HOST_MASK; } static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { 
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_to < ip2_from) swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } if (retried) { ip = ntohl(h->next.ip[0]); ip2 = ntohl(h->next.ip[1]); } else { ip2 = ip2_from; } do { e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { i++; e.ip[1] = htonl(ip2); if (i > IPSET_MAX_RANGE) { hash_netnet4_data_next(&h->next, &e); return -ERANGE; } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } while (ip++ < ip_to); return ret; } /* IPv6 variants */ struct hash_netnet6_elem { union nf_inet_addr ip[2]; u8 nomatch; u8 padding; union { u8 cidr[2]; u16 ccmp; }; }; /* Common functions */ static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && ip1->ccmp == ip2->ccmp; } static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { ip6_netmask(&elem->ip[1], cidr); elem->cidr[1] = cidr; } else { ip6_netmask(&elem->ip[0], cidr); elem->cidr[0] = cidr; } } static bool hash_netnet6_data_list(struct sk_buff *skb, const struct hash_netnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_netnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static void hash_netnet6_init(struct hash_netnet6_elem *e) { e->cidr[0] = HOST_MASK; e->cidr[1] = HOST_MASK; } static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) return -IPSET_ERR_HASH_ELEM; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; } static struct ip_set_type hash_netnet_type __read_mostly = { .name = "hash:net,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_netnet_init(void) { return ip_set_type_register(&hash_netnet_type); } static void __exit hash_netnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_netnet_type); } module_init(hash_netnet_init); module_exit(hash_netnet_fini);
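One implementation detail worth noting: both element types keep their two prefix lengths in a union with a single 16-bit ccmp field, so the data_equal() helpers compare both CIDR values in one integer comparison (the IPv4 variant plays the same trick with ip[2] aliased to the 64-bit ipcmp). A standalone userspace sketch of that aliasing, purely illustrative and not part of the module:

#include <assert.h>
#include <stdint.h>

/* Mirrors the cidr[2]/ccmp union inside hash_netnet4_elem. */
union cidr_pair {
	uint8_t cidr[2];
	uint16_t ccmp;
};

int main(void)
{
	union cidr_pair a = { .cidr = { 24, 32 } };
	union cidr_pair b = { .cidr = { 24, 32 } };
	union cidr_pair c = { .cidr = { 16, 32 } };

	assert(a.ccmp == b.ccmp);	/* equal prefix pairs compare equal */
	assert(a.ccmp != c.ccmp);	/* any differing prefix breaks equality */
	return 0;
}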
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright © 2006-2015, Intel Corporation.
 *
 * Authors: Ashok Raj <ashok.raj@intel.com>
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 *          David Woodhouse <David.Woodhouse@intel.com>
 */

#ifndef _INTEL_IOMMU_H_
#define _INTEL_IOMMU_H_

#include <linux/types.h>
#include <linux/iova.h>
#include <linux/io.h>
#include <linux/idr.h>
#include <linux/mmu_notifier.h>
#include <linux/list.h>
#include <linux/iommu.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/dmar.h>
#include <linux/bitfield.h>
#include <linux/xarray.h>
#include <linux/perf_event.h>
#include <linux/pci.h>

#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>

/*
 * VT-d hardware uses 4KiB page size regardless of host page size.
 */
#define VTD_PAGE_SHIFT		(12)
#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

#define VTD_STRIDE_SHIFT	(9)
#define VTD_STRIDE_MASK		(((u64)-1) << VTD_STRIDE_SHIFT)

#define DMA_PTE_READ		BIT_ULL(0)
#define DMA_PTE_WRITE		BIT_ULL(1)
#define DMA_PTE_LARGE_PAGE	BIT_ULL(7)
#define DMA_PTE_SNP		BIT_ULL(11)

#define DMA_FL_PTE_PRESENT	BIT_ULL(0)
#define DMA_FL_PTE_US		BIT_ULL(2)
#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
#define DMA_FL_PTE_DIRTY	BIT_ULL(6)

#define DMA_SL_PTE_DIRTY_BIT	9
#define DMA_SL_PTE_DIRTY	BIT_ULL(DMA_SL_PTE_DIRTY_BIT)

#define ADDR_WIDTH_5LEVEL	(57)
#define ADDR_WIDTH_4LEVEL	(48)

#define CONTEXT_TT_MULTI_LEVEL	0
#define CONTEXT_TT_DEV_IOTLB	1
#define CONTEXT_TT_PASS_THROUGH	2
#define CONTEXT_PASIDE		BIT_ULL(3)

/*
 * Intel IOMMU register specification per version 1.0 public spec.
 */
#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
#define	DMAR_GCMD_REG	0x18	/* Global command register */
#define	DMAR_GSTS_REG	0x1c	/* Global status register */
#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
#define	DMAR_CCMD_REG	0x28	/* Context command reg */
#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
#define	DMAR_FECTL_REG	0x38	/* Fault control register */
#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
#define	DMAR_IQH_REG	0x80	/* Invalidation queue head register */
#define	DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
#define	DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
#define	DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
#define	DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
#define	DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
#define	DMAR_IRTA_REG	0xb8	/* Interrupt remapping table addr register */
#define	DMAR_PQH_REG	0xc0	/* Page request queue head register */
#define	DMAR_PQT_REG	0xc8	/* Page request queue tail register */
#define	DMAR_PQA_REG	0xd0	/* Page request queue address register */
#define	DMAR_PRS_REG	0xdc	/* Page request status register */
#define	DMAR_PECTL_REG	0xe0	/* Page request event control register */
#define	DMAR_PEDATA_REG	0xe4	/* Page request event interrupt data register */
#define	DMAR_PEADDR_REG	0xe8	/* Page request event interrupt addr register */
#define	DMAR_PEUADDR_REG 0xec	/* Page request event Upper address register */
#define	DMAR_MTRRCAP_REG 0x100	/* MTRR capability register */
#define	DMAR_MTRRDEF_REG 0x108	/* MTRR default type register */
#define	DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */
#define	DMAR_MTRR_FIX16K_80000_REG 0x128
#define	DMAR_MTRR_FIX16K_A0000_REG 0x130
#define	DMAR_MTRR_FIX4K_C0000_REG 0x138
#define	DMAR_MTRR_FIX4K_C8000_REG 0x140
#define	DMAR_MTRR_FIX4K_D0000_REG 0x148
#define	DMAR_MTRR_FIX4K_D8000_REG 0x150
#define	DMAR_MTRR_FIX4K_E0000_REG 0x158
#define	DMAR_MTRR_FIX4K_E8000_REG 0x160
#define	DMAR_MTRR_FIX4K_F0000_REG 0x168
#define	DMAR_MTRR_FIX4K_F8000_REG 0x170
#define	DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */
#define	DMAR_MTRR_PHYSMASK0_REG 0x188
#define	DMAR_MTRR_PHYSBASE1_REG 0x190
#define	DMAR_MTRR_PHYSMASK1_REG 0x198
#define	DMAR_MTRR_PHYSBASE2_REG 0x1a0
#define	DMAR_MTRR_PHYSMASK2_REG 0x1a8
#define	DMAR_MTRR_PHYSBASE3_REG 0x1b0
#define	DMAR_MTRR_PHYSMASK3_REG 0x1b8
#define	DMAR_MTRR_PHYSBASE4_REG 0x1c0
#define	DMAR_MTRR_PHYSMASK4_REG 0x1c8
#define	DMAR_MTRR_PHYSBASE5_REG 0x1d0
#define	DMAR_MTRR_PHYSMASK5_REG 0x1d8
#define	DMAR_MTRR_PHYSBASE6_REG 0x1e0
#define	DMAR_MTRR_PHYSMASK6_REG 0x1e8
#define	DMAR_MTRR_PHYSBASE7_REG 0x1f0
#define	DMAR_MTRR_PHYSMASK7_REG 0x1f8
#define	DMAR_MTRR_PHYSBASE8_REG 0x200
#define	DMAR_MTRR_PHYSMASK8_REG 0x208
#define	DMAR_MTRR_PHYSBASE9_REG 0x210
#define	DMAR_MTRR_PHYSMASK9_REG 0x218
#define	DMAR_PERFCAP_REG	0x300
#define	DMAR_PERFCFGOFF_REG	0x310
#define	DMAR_PERFOVFOFF_REG	0x318
#define	DMAR_PERFCNTROFF_REG	0x31c
#define	DMAR_PERFINTRSTS_REG	0x324
#define	DMAR_PERFINTRCTL_REG	0x328
#define	DMAR_PERFEVNTCAP_REG	0x380
#define	DMAR_ECMD_REG		0x400
#define	DMAR_ECEO_REG		0x408
#define	DMAR_ECRSP_REG		0x410
#define	DMAR_ECCAP_REG		0x430

#define	DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
#define	DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
#define	DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)

#define	OFFSET_STRIDE		(9)

#define dmar_readq(a) readq(a)
#define dmar_writeq(a,v) writeq(v,a)
#define dmar_readl(a) readl(a)
#define dmar_writel(a, v) writel(v, a)

#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
#define DMAR_VER_MINOR(v)		((v) & 0x0f)

/*
 * Decoding Capability Register
 */
#define cap_esrtps(c)		(((c) >> 63) & 1)
#define cap_esirtps(c)		(((c) >> 62) & 1)
#define cap_ecmds(c)		(((c) >> 61) & 1)
#define cap_fl5lp_support(c)	(((c) >> 60) & 1)
#define cap_pi_support(c)	(((c) >> 59) & 1)
#define cap_fl1gp_support(c)	(((c) >> 56) & 1)
#define cap_read_drain(c)	(((c) >> 55) & 1)
#define cap_write_drain(c)	(((c) >> 54) & 1)
#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
#define cap_max_fault_reg_offset(c) \
	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
#define cap_zlr(c)		(((c) >> 22) & 1)
#define cap_isoch(c)		(((c) >> 23) & 1)
#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
#define cap_caching_mode(c)	(((c) >> 7) & 1)
#define cap_phmr(c)		(((c) >> 6) & 1)
#define cap_plmr(c)		(((c) >> 5) & 1)
#define cap_rwbf(c)		(((c) >> 4) & 1)
#define cap_afl(c)		(((c) >> 3) & 1)
#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))

/*
 * Extended Capability Register
 */
#define ecap_pms(e)		(((e) >> 51) & 0x1)
#define ecap_rps(e)		(((e) >> 49) & 0x1)
#define ecap_smpwc(e)		(((e) >> 48) & 0x1)
#define ecap_flts(e)		(((e) >> 47) & 0x1)
#define ecap_slts(e)		(((e) >> 46) & 0x1)
#define ecap_slads(e)		(((e) >> 45) & 0x1)
#define ecap_smts(e)		(((e) >> 43) & 0x1)
#define ecap_dit(e)		(((e) >> 41) & 0x1)
#define ecap_pds(e)		(((e) >> 42) & 0x1)
#define ecap_pasid(e)		(((e) >> 40) & 0x1)
#define ecap_pss(e)		(((e) >> 35) & 0x1f)
#define ecap_eafs(e)		(((e) >> 34) & 0x1)
#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
#define ecap_srs(e)		(((e) >> 31) & 0x1)
#define ecap_ers(e)		(((e) >> 30) & 0x1)
#define ecap_prs(e)		(((e) >> 29) & 0x1)
#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
#define ecap_dis(e)		(((e) >> 27) & 0x1)
#define ecap_nest(e)		(((e) >> 26) & 0x1)
#define ecap_mts(e)		(((e) >> 25) & 0x1)
#define ecap_iotlb_offset(e)	((((e) >> 8) & 0x3ff) * 16)
#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
#define ecap_coherent(e)	((e) & 0x1)
#define ecap_qis(e)		((e) & 0x2)
#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */

/*
 * Decoding Perf Capability Register
 */
#define pcap_num_cntr(p)	((p) & 0xffff)
#define pcap_cntr_width(p)	(((p) >> 16) & 0x7f)
#define pcap_num_event_group(p)	(((p) >> 24) & 0x1f)
#define pcap_filters_mask(p)	(((p) >> 32) & 0x1f)
#define pcap_interrupt(p)	(((p) >> 50) & 0x1)
/* The counter stride is calculated as 2 ^ (x+10) bytes */
#define pcap_cntr_stride(p)	(1ULL << ((((p) >> 52) & 0x7) + 10))

/*
 * Decoding Perf Event Capability Register
 */
#define pecap_es(p)		((p) & 0xfffffff)

/* Virtual command interface capability */
#define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */

/* IOTLB_REG */
#define DMA_TLB_FLUSH_GRANU_OFFSET	60
#define DMA_TLB_GLOBAL_FLUSH	(((u64)1) << 60)
#define DMA_TLB_DSI_FLUSH	(((u64)2) << 60)
#define DMA_TLB_PSI_FLUSH	(((u64)3) << 60)
#define DMA_TLB_IIRG(type)	((type >> 60) & 3)
#define DMA_TLB_IAIG(val)	(((val) >> 57) & 3)
#define DMA_TLB_READ_DRAIN	(((u64)1) << 49)
#define DMA_TLB_WRITE_DRAIN	(((u64)1) << 48)
#define DMA_TLB_DID(id)		(((u64)((id) & 0xffff)) << 32)
#define DMA_TLB_IVT		(((u64)1) << 63)
#define DMA_TLB_IH_NONLEAF	(((u64)1) << 6)
#define DMA_TLB_MAX_SIZE	(0x3f)

/* INVALID_DESC */
#define DMA_CCMD_INVL_GRANU_OFFSET	61
#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
#define DMA_ID_TLB_ADDR(addr)	(addr)
#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)

/* PMEN_REG */
#define DMA_PMEN_EPM	(((u32)1) << 31)
#define DMA_PMEN_PRS	(((u32)1) << 0)

/* GCMD_REG */
#define DMA_GCMD_TE	(((u32)1) << 31)
#define DMA_GCMD_SRTP	(((u32)1) << 30)
#define DMA_GCMD_SFL	(((u32)1) << 29)
#define DMA_GCMD_EAFL	(((u32)1) << 28)
#define DMA_GCMD_WBF	(((u32)1) << 27)
#define DMA_GCMD_QIE	(((u32)1) << 26)
#define DMA_GCMD_SIRTP	(((u32)1) << 24)
#define DMA_GCMD_IRE	(((u32)1) << 25)
#define DMA_GCMD_CFI	(((u32)1) << 23)

/* GSTS_REG */
#define DMA_GSTS_TES	(((u32)1) << 31)
#define DMA_GSTS_RTPS	(((u32)1) << 30)
#define DMA_GSTS_FLS	(((u32)1) << 29)
#define DMA_GSTS_AFLS	(((u32)1) << 28)
#define DMA_GSTS_WBFS	(((u32)1) << 27)
#define DMA_GSTS_QIES	(((u32)1) << 26)
#define DMA_GSTS_IRTPS	(((u32)1) << 24)
#define DMA_GSTS_IRES	(((u32)1) << 25)
#define DMA_GSTS_CFIS	(((u32)1) << 23)

/* DMA_RTADDR_REG */
#define DMA_RTADDR_SMT	(((u64)1) << 10)

/* CCMD_REG */
#define DMA_CCMD_ICC		(((u64)1) << 63)
#define DMA_CCMD_GLOBAL_INVL	(((u64)1) << 61)
#define DMA_CCMD_DOMAIN_INVL	(((u64)2) << 61)
#define DMA_CCMD_DEVICE_INVL	(((u64)3) << 61)
#define DMA_CCMD_FM(m)		(((u64)((m) & 0x3)) << 32)
#define DMA_CCMD_MASK_NOBIT	0
#define DMA_CCMD_MASK_1BIT	1
#define DMA_CCMD_MASK_2BIT	2
#define DMA_CCMD_MASK_3BIT	3
#define DMA_CCMD_SID(s)		(((u64)((s) & 0xffff)) << 16)
#define DMA_CCMD_DID(d)		((u64)((d) & 0xffff))
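/*
 * Illustrative sketch (my addition, not part of the original header): the
 * cap_*()/ecap_*() helpers above decode raw register values read with
 * dmar_readq(). A probe-time dump could look like this, assuming a live
 * "iommu" pointer:
 *
 *	u64 cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
 *
 *	if (cap_caching_mode(cap))
 *		pr_info("caching mode: domain id 0 tags invalid entries\n");
 *	pr_info("mgaw %d, %lu domains, %d fault recording registers\n",
 *		(int)cap_mgaw(cap), cap_ndoms(cap),
 *		(int)cap_num_fault_regs(cap));
 */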
/* ECMD_REG */
#define DMA_MAX_NUM_ECMD		256
#define DMA_MAX_NUM_ECMDCAP		(DMA_MAX_NUM_ECMD / 64)
#define DMA_ECMD_REG_STEP		8
#define DMA_ECMD_ENABLE			0xf0
#define DMA_ECMD_DISABLE		0xf1
#define DMA_ECMD_FREEZE			0xf4
#define DMA_ECMD_UNFREEZE		0xf5
#define DMA_ECMD_OA_SHIFT		16
#define DMA_ECMD_ECRSP_IP		0x1
#define DMA_ECMD_ECCAP3			3
#define DMA_ECMD_ECCAP3_ECNTS		BIT_ULL(48)
#define DMA_ECMD_ECCAP3_DCNTS		BIT_ULL(49)
#define DMA_ECMD_ECCAP3_FCNTS		BIT_ULL(52)
#define DMA_ECMD_ECCAP3_UFCNTS		BIT_ULL(53)
#define DMA_ECMD_ECCAP3_ESSENTIAL	(DMA_ECMD_ECCAP3_ECNTS |	\
					 DMA_ECMD_ECCAP3_DCNTS |	\
					 DMA_ECMD_ECCAP3_FCNTS |	\
					 DMA_ECMD_ECCAP3_UFCNTS)

/* FECTL_REG */
#define DMA_FECTL_IM	(((u32)1) << 31)

/* FSTS_REG */
#define DMA_FSTS_PFO	(1 << 0) /* Primary Fault Overflow */
#define DMA_FSTS_PPF	(1 << 1) /* Primary Pending Fault */
#define DMA_FSTS_IQE	(1 << 4) /* Invalidation Queue Error */
#define DMA_FSTS_ICE	(1 << 5) /* Invalidation Completion Error */
#define DMA_FSTS_ITE	(1 << 6) /* Invalidation Time-out Error */
#define DMA_FSTS_PRO	(1 << 7) /* Page Request Overflow */
#define dma_fsts_fault_record_index(s)	(((s) >> 8) & 0xff)

/* FRCD_REG, 32 bits access */
#define DMA_FRCD_F	(((u32)1) << 31)
#define dma_frcd_type(d)		((d >> 30) & 1)
#define dma_frcd_fault_reason(c)	(c & 0xff)
#define dma_frcd_source_id(c)		(c & 0xffff)
#define dma_frcd_pasid_value(c)		(((c) >> 8) & 0xfffff)
#define dma_frcd_pasid_present(c)	(((c) >> 31) & 1)
/* low 64 bit */
#define dma_frcd_page_addr(d)		(d & (((u64)-1) << PAGE_SHIFT))

/* PRS_REG */
#define DMA_PRS_PPR	((u32)1)
#define DMA_PRS_PRO	((u32)2)

#define DMA_VCS_PAS	((u64)1)

/* PERFINTRSTS_REG */
#define DMA_PERFINTRSTS_PIS	((u32)1)

#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
do {									\
	cycles_t start_time = get_cycles();				\
	while (1) {							\
		sts = op(iommu->reg + offset);				\
		if (cond)						\
			break;						\
		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
			panic("DMAR hardware is malfunctioning\n");	\
		cpu_relax();						\
	}								\
} while (0)

#define QI_LENGTH	256	/* queue length */

enum {
	QI_FREE,
	QI_IN_USE,
	QI_DONE,
	QI_ABORT
};

#define QI_CC_TYPE		0x1
#define QI_IOTLB_TYPE		0x2
#define QI_DIOTLB_TYPE		0x3
#define QI_IEC_TYPE		0x4
#define QI_IWD_TYPE		0x5
#define QI_EIOTLB_TYPE		0x6
#define QI_PC_TYPE		0x7
#define QI_DEIOTLB_TYPE		0x8
#define QI_PGRP_RESP_TYPE	0x9
#define QI_PSTRM_RESP_TYPE	0xa

#define QI_IEC_SELECTIVE	(((u64)1) << 4)
#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))

#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
#define QI_IWD_FENCE		(((u64)1) << 6)
#define QI_IWD_PRQ_DRAIN	(((u64)1) << 7)

#define QI_IOTLB_DID(did)	(((u64)did) << 16)
#define QI_IOTLB_DR(dr)		(((u64)dr) << 7)
#define QI_IOTLB_DW(dw)		(((u64)dw) << 6)
#define QI_IOTLB_GRAN(gran)	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
#define QI_IOTLB_AM(am)		(((u8)am) & 0x3f)

#define QI_CC_FM(fm)		(((u64)fm) << 48)
#define QI_CC_SID(sid)		(((u64)sid) << 32)
#define QI_CC_DID(did)		(((u64)did) << 16)
#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))

#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
				   ((u64)((pfsid >> 4) & 0xfff) << 52))
#define QI_DEV_IOTLB_SIZE	1
#define QI_DEV_IOTLB_MAX_INVS	32
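/*
 * Illustrative sketch (my addition): the QI_DEV_IOTLB_* fields above are
 * OR'ed together to form the two quadwords of a device-IOTLB invalidation
 * descriptor (struct qi_desc, defined below), roughly:
 *
 *	desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
 *		   QI_DEV_IOTLB_PFSID(pfsid) | QI_DIOTLB_TYPE;
 *	desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
 *
 * which is exactly the composition performed by the qi_desc_dev_iotlb()
 * helper further down in this header.
 */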
#define QI_PC_PASID(pasid)	(((u64)pasid) << 32)
#define QI_PC_DID(did)		(((u64)did) << 16)
#define QI_PC_GRAN(gran)	(((u64)gran) << 4)

/* PASID cache invalidation granu */
#define QI_PC_ALL_PASIDS	0
#define QI_PC_PASID_SEL		1
#define QI_PC_GLOBAL		3

#define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
#define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
#define QI_EIOTLB_AM(am)	(((u64)am) & 0x3f)
#define QI_EIOTLB_PASID(pasid)	(((u64)pasid) << 32)
#define QI_EIOTLB_DID(did)	(((u64)did) << 16)
#define QI_EIOTLB_GRAN(gran)	(((u64)gran) << 4)

/* QI Dev-IOTLB inv granu */
#define QI_DEV_IOTLB_GRAN_ALL		1
#define QI_DEV_IOTLB_GRAN_PASID_SEL	0

#define QI_DEV_EIOTLB_ADDR(a)	((u64)(a) & VTD_PAGE_MASK)
#define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
#define QI_DEV_EIOTLB_PASID(p)	((u64)((p) & 0xfffff) << 32)
#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
				    ((u64)((pfsid >> 4) & 0xfff) << 52))
#define QI_DEV_EIOTLB_MAX_INVS	32

/* Page group response descriptor QW0 */
#define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)

/* Page group response descriptor QW1 */
#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)

#define QI_RESP_SUCCESS		0x0
#define QI_RESP_INVALID		0x1
#define QI_RESP_FAILURE		0xf

#define QI_GRAN_NONG_PASID	2
#define QI_GRAN_PSI_PASID	3

#define qi_shift(iommu)		(DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap))

struct qi_desc {
	u64 qw0;
	u64 qw1;
	u64 qw2;
	u64 qw3;
};

struct q_inval {
	raw_spinlock_t	q_lock;
	void		*desc;		/* invalidation queue */
	int		*desc_status;	/* desc status */
	int		free_head;	/* first free entry */
	int		free_tail;	/* last free entry */
	int		free_cnt;
};

/* Page Request Queue depth */
#define PRQ_ORDER	4
#define PRQ_SIZE	(SZ_4K << PRQ_ORDER)
#define PRQ_RING_MASK	(PRQ_SIZE - 0x20)
#define PRQ_DEPTH	(PRQ_SIZE >> 5)

struct dmar_pci_notify_info;

#ifdef CONFIG_IRQ_REMAP
#define INTR_REMAP_TABLE_REG_SIZE	0xf
#define INTR_REMAP_TABLE_REG_SIZE_MASK	0xf
#define INTR_REMAP_TABLE_ENTRIES	65536

struct irq_domain;

struct ir_table {
	struct irte *base;
	unsigned long *bitmap;
};

void intel_irq_remap_add_device(struct dmar_pci_notify_info *info);
#else
static inline void
intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { }
#endif

struct iommu_flush {
	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
			      u8 fm, u64 type);
	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
			    unsigned int size_order, u64 type);
};

enum {
	SR_DMAR_FECTL_REG,
	SR_DMAR_FEDATA_REG,
	SR_DMAR_FEADDR_REG,
	SR_DMAR_FEUADDR_REG,
	MAX_SR_DMAR_REGS
};

#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
#define VTD_FLAG_SVM_CAPABLE		(1 << 2)

#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
				 ecap_pasid((iommu)->ecap))
#define ssads_supported(iommu)	(sm_supported(iommu) &&			\
				 ecap_slads((iommu)->ecap) &&		\
				 ecap_smpwc(iommu->ecap))
#define nested_supported(iommu)	(sm_supported(iommu) &&			\
				 ecap_nest((iommu)->ecap))

struct pasid_entry;
struct pasid_state_entry;
struct page_req_dsc;

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
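/*
 * Illustrative sketch (my addition): in legacy mode the root table is
 * indexed by PCI bus number and each present root entry points at a
 * context table indexed by devfn, roughly:
 *
 *	struct root_entry *re = &iommu->root_entry[bus];
 *	struct context_entry *ctx;
 *
 *	if (re->lo & 1) {			// present bit
 *		ctx = phys_to_virt(re->lo & VTD_PAGE_MASK);
 *		ctx += devfn;
 *	}
 *
 * The real lookup, including the scalable-mode layout, is done by
 * iommu_context_addr() declared later in this header.
 */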
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

struct iommu_domain_info {
	struct intel_iommu *iommu;
	unsigned int refcnt;		/* Refcount of devices per iommu */
	u16 did;			/* Domain ids per IOMMU. Use u16 since
					 * domain ids are 16 bit wide according
					 * to VT-d spec, section 9.3 */
};

/*
 * We start simply by using a fixed size for the batched descriptors. This
 * size is currently sufficient for our needs. Future improvements could
 * involve dynamically allocating the batch buffer based on actual demand,
 * allowing us to adjust the batch size for optimal performance in different
 * scenarios.
 */
#define QI_MAX_BATCHED_DESC_COUNT 16
struct qi_batch {
	struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT];
	unsigned int index;
};

struct dmar_domain {
	int	nid;			/* node id */
	struct xarray iommu_array;	/* Attached IOMMU array */

	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
	u8 set_pte_snp:1;
	u8 use_first_level:1;		/* DMA translation for the domain goes
					 * through the first level page table,
					 * otherwise, goes through the second
					 * level.
					 */
	u8 dirty_tracking:1;		/* Dirty tracking is enabled */
	u8 nested_parent:1;		/* Has other domains nested on it */
	u8 has_mappings:1;		/* Has mappings configured through
					 * iommu_map() interface.
					 */
	u8 iotlb_sync_map:1;		/* Need to flush IOTLB cache or write
					 * buffer when creating mappings.
					 */

	spinlock_t lock;		/* Protect device tracking lists */
	struct list_head devices;	/* all devices' list */
	struct list_head dev_pasids;	/* all attached pasids */

	spinlock_t cache_lock;		/* Protect the cache tag list */
	struct list_head cache_tags;	/* Cache tag list */
	struct qi_batch *qi_batch;	/* Batched QI descriptors */

	int	iommu_superpage;	/* Level of superpages supported:
					   0 == 4KiB (no superpages),
					   1 == 2MiB, 2 == 1GiB,
					   3 == 512GiB, 4 == 1TiB */
	union {
		/* DMA remapping domain */
		struct {
			/* virtual address */
			struct dma_pte	*pgd;
			/* max guest address width */
			int		gaw;
			/*
			 * adjusted guest address width:
			 *   0: level 2 30-bit
			 *   1: level 3 39-bit
			 *   2: level 4 48-bit
			 *   3: level 5 57-bit
			 */
			int		agaw;
			/* maximum mapped address */
			u64		max_addr;
			/* Protect the s1_domains list */
			spinlock_t	s1_lock;
			/* Track s1_domains nested on this domain */
			struct list_head s1_domains;
		};

		/* Nested user domain */
		struct {
			/* parent page table which the user domain is nested on */
			struct dmar_domain *s2_domain;
			/* page table attributes */
			struct iommu_hwpt_vtd_s1 s1_cfg;
			/* link to parent domain siblings */
			struct list_head s2_link;
		};

		/* SVA domain */
		struct {
			struct mmu_notifier notifier;
		};
	};

	struct iommu_domain domain;	/* generic domain data structure for
					   iommu core */
};
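/*
 * Worked example (my addition): the agaw encoding documented above maps to
 * paging levels and address widths as width = 30 + 9 * agaw, so agaw == 2
 * means a 4-level table covering 48 bits, and agaw == 3 a 5-level table
 * covering 57 bits. The agaw_to_level()/agaw_to_width() helpers later in
 * this header implement exactly this arithmetic.
 */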
/*
 * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
 * But in practice, there are only 14 counters for the existing
 * platform. Setting the max number of counters to 64 should be good
 * enough for a long time. Also, supporting more than 64 counters
 * requires more extras, e.g., extra freeze and overflow registers,
 * which is not necessary for now.
 */
#define IOMMU_PMU_IDX_MAX		64

struct iommu_pmu {
	struct intel_iommu	*iommu;
	u32			num_cntr;	/* Number of counters */
	u32			num_eg;		/* Number of event group */
	u32			cntr_width;	/* Counter width */
	u32			cntr_stride;	/* Counter Stride */
	u32			filter;		/* Bitmask of filter support */
	void __iomem		*base;		/* the PerfMon base address */
	void __iomem		*cfg_reg;	/* counter configuration base address */
	void __iomem		*cntr_reg;	/* counter 0 address */
	void __iomem		*overflow;	/* overflow status register */

	u64			*evcap;		/* Indicates all supported events */
	u32			**cntr_evcap;	/* Supported events of each counter. */

	struct pmu		pmu;
	DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
	struct perf_event	*event_list[IOMMU_PMU_IDX_MAX];
	unsigned char		irq_name[16];
};

#define IOMMU_IRQ_ID_OFFSET_PRQ		(DMAR_UNITS_SUPPORTED)
#define IOMMU_IRQ_ID_OFFSET_PERF	(2 * DMAR_UNITS_SUPPORTED)

struct intel_iommu {
	void __iomem	*reg;		/* Pointer to hardware regs, virtual addr */
	u64		reg_phys;	/* physical address of hw register set */
	u64		reg_size;	/* size of hw register set */
	u64		cap;
	u64		ecap;
	u64		vccap;
	u64		ecmdcap[DMA_MAX_NUM_ECMDCAP];
	u32		gcmd;		/* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
	raw_spinlock_t	register_lock;	/* protect register handling */
	int		seq_id;		/* sequence id of the iommu */
	int		agaw;		/* agaw of this iommu */
	int		msagaw;		/* max sagaw of this iommu */
	unsigned int	irq, pr_irq, perf_irq;
	u16		segment;	/* PCI segment# */
	unsigned char	name[16];	/* Device Name */

#ifdef CONFIG_INTEL_IOMMU
	/* mutex to protect domain_ida */
	struct mutex	did_lock;
	struct ida	domain_ida;	/* domain id allocator */
	unsigned long	*copied_tables;	/* bitmap of copied tables */
	spinlock_t	lock;		/* protect context, domain ids */
	struct root_entry *root_entry;	/* virtual address */

	struct iommu_flush flush;
#endif
	struct page_req_dsc *prq;
	unsigned char prq_name[16];	/* Name for PRQ interrupt */
	unsigned long prq_seq_number;
	struct completion prq_complete;
	struct iopf_queue *iopf_queue;
	unsigned char iopfq_name[16];
	/* Synchronization between fault report and iommu device release. */
	struct mutex iopf_lock;
	struct q_inval	*qi;		/* Queued invalidation info */
	u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/

	/* rb tree for all probed devices */
	struct rb_root device_rbtree;
	/* protect the device_rbtree */
	spinlock_t device_rbtree_lock;

#ifdef CONFIG_IRQ_REMAP
	struct ir_table *ir_table;	/* Interrupt remapping info */
	struct irq_domain *ir_domain;
#endif
	struct iommu_device iommu;	/* IOMMU core code handle */
	int		node;
	u32		flags;		/* Software defined flags */

	struct dmar_drhd_unit *drhd;
	void *perf_statistic;

	struct iommu_pmu *pmu;
};

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	u32 segment;		/* PCI segment number */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	u16 pfsid;		/* SRIOV physical function source ID */
	u8 pasid_supported:3;
	u8 pasid_enabled:1;
	u8 pri_supported:1;
	u8 pri_enabled:1;
	u8 ats_supported:1;
	u8 ats_enabled:1;
	u8 dtlb_extra_inval:1;	/* Quirk for devices need extra flush */
	u8 domain_attached:1;	/* Device has domain attached */
	u8 ats_qdep;
	unsigned int iopf_refcount;
	struct device *dev;		/* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu;	/* IOMMU used by this device */
	struct dmar_domain *domain;	/* pointer to domain */
	struct pasid_table *pasid_table; /* pasid table */
	/* device tracking node (lookup by PCI RID) */
	struct rb_node node;
#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
	struct dentry *debugfs_dentry;	/* pointer to device directory dentry */
#endif
};

struct dev_pasid_info {
	struct list_head link_domain;	/* link to domain siblings */
	struct device *dev;
	ioasid_t pasid;
#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
	struct dentry *debugfs_dentry;	/* pointer to pasid directory dentry */
#endif
};

static inline void __iommu_flush_cache(
	struct intel_iommu *iommu, void *addr, int size)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(addr, size);
}

/* Convert generic struct iommu_domain to private struct dmar_domain */
static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}

/*
 * Domain ID 0 and 1 are reserved:
 *
 * If Caching mode is set, then invalid translations are tagged
 * with domain-id 0, hence we need to pre-allocate it. We also
 * use domain-id 0 as a marker for non-allocated domain-id, so
 * make sure it is not used for a real domain.
 *
 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose. This domain id is also used for identity domain
 * in legacy mode.
 */
#define FLPT_DEFAULT_DID		1
#define IDA_START_DID			2

/* Retrieve the domain ID allocated to the domain */
static inline u16
domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info =
		xa_load(&domain->iommu_array, iommu->seq_id);

	return info->did;
}

static inline u16
iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu)
{
	if (domain->type == IOMMU_DOMAIN_SVA ||
	    domain->type == IOMMU_DOMAIN_IDENTITY)
		return FLPT_DEFAULT_DID;
	return domain_id_iommu(to_dmar_domain(domain), iommu);
}

static inline bool dev_is_real_dma_subdevice(struct device *dev)
{
	return dev && dev_is_pci(dev) &&
	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
}

/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
						   unsigned long flags)
{
	if (flags & IOMMU_DIRTY_NO_CLEAR)
		return (pte->val & DMA_SL_PTE_DIRTY) != 0;

	return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
				  (unsigned long *)&pte->val);
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline bool first_pte_in_page(struct dma_pte *pte)
{
	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
}

static inline int nr_pte_to_next_page(struct dma_pte *pte)
{
	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
}
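/*
 * Illustrative sketch (my addition): nr_pte_to_next_page() lets a walker
 * process a contiguous PTE range one page-table page at a time, e.g.:
 *
 *	while (npages) {
 *		unsigned long nr = min_t(unsigned long, npages,
 *					 nr_pte_to_next_page(pte));
 *
 *		npages -= nr;
 *		pte += nr;
 *		if (npages && first_pte_in_page(pte))
 *			break;	// move on to the next page-table page
 *	}
 */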
static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
#define MAX_AGAW_WIDTH		(64)
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE,
			    MAX_AGAW_PFN_WIDTH);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_set_pasid(struct context_entry *context)
{
	context->lo |= CONTEXT_PASIDE;
}

static inline int context_domain_id(struct context_entry *c)
{
	return ((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

#ifdef CONFIG_INTEL_IOMMU
static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu->copied_tables)
		return false;

	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}

static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
#endif /* CONFIG_INTEL_IOMMU */

/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= BIT_ULL(2);
}
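/*
 * Illustrative sketch (my addition): programming a legacy-mode context
 * entry with the helpers above, assuming ctx/did/agaw/pgd_phys are already
 * known to the caller:
 *
 *	context_clear_entry(ctx);
 *	context_set_domain_id(ctx, did);
 *	context_set_address_width(ctx, agaw);
 *	context_set_address_root(ctx, pgd_phys);
 *	context_set_translation_type(ctx, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ctx);
 *	context_set_present(ctx);
 *
 * The present bit is set last so the hardware never observes a
 * half-initialized entry.
 */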
/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= BIT_ULL(4);
}

/*
 * Clear the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_clear_sm_pre(struct context_entry *context)
{
	context->lo &= ~BIT_ULL(4);
}

/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}

/* Return a size from number of VTD pages. */
static inline unsigned long nrpages_to_size(unsigned long npages)
{
	return npages << VTD_PAGE_SHIFT;
}

static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
				 unsigned int size_order, u64 type,
				 struct qi_desc *desc)
{
	u8 dw = 0, dr = 0;
	int ih = 0;

	if (cap_write_drain(iommu->cap))
		dw = 1;

	if (cap_read_drain(iommu->cap))
		dr = 1;

	desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
	desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
		| QI_IOTLB_AM(size_order);
	desc->qw2 = 0;
	desc->qw3 = 0;
}

static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr,
				     unsigned int mask, struct qi_desc *desc)
{
	if (mask) {
		addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
		desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
	} else {
		desc->qw1 = QI_DEV_IOTLB_ADDR(addr);
	}

	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
		qdep = 0;

	desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
		   QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
	desc->qw2 = 0;
	desc->qw3 = 0;
}

static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr,
				  unsigned long npages, bool ih,
				  struct qi_desc *desc)
{
	if (npages == -1) {
		desc->qw0 = QI_EIOTLB_PASID(pasid) |
				QI_EIOTLB_DID(did) |
				QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
				QI_EIOTLB_TYPE;
		desc->qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(npages));
		unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));

		if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
			addr = ALIGN_DOWN(addr, align);

		desc->qw0 = QI_EIOTLB_PASID(pasid) |
				QI_EIOTLB_DID(did) |
				QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
				QI_EIOTLB_TYPE;
		desc->qw1 = QI_EIOTLB_ADDR(addr) |
				QI_EIOTLB_IH(ih) |
				QI_EIOTLB_AM(mask);
	}
}
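/*
 * Worked example (my addition) of the page-selective encoding used by
 * qi_desc_piotlb(): flushing npages == 8 gives mask = ilog2(8) = 3, so the
 * address must be aligned to 1 << (VTD_PAGE_SHIFT + 3) = 32KiB and
 * QI_EIOTLB_AM(3) tells the hardware to invalidate 2^3 contiguous pages
 * starting at that address. A non-power-of-two npages is rounded up first.
 */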
static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid,
					   u16 qdep, u64 addr,
					   unsigned int size_order,
					   struct qi_desc *desc)
{
	unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);

	desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
		QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
		QI_DEV_IOTLB_PFSID(pfsid);

	/*
	 * If S bit is 0, we only flush a single page. If S bit is set,
	 * the least significant zero bit indicates the invalidation address
	 * range. VT-d spec 6.5.2.6.
	 * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB.
	 * size order = 0 is PAGE_SIZE 4KB
	 * Max Invs Pending (MIP) is set to 0 for now until we have DIT in
	 * ECAP.
	 */
	if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order))
		pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n",
				    addr, size_order);

	/* Take page address */
	desc->qw1 = QI_DEV_EIOTLB_ADDR(addr);

	if (size_order) {
		/*
		 * Existing 0s in address below size_order may be the least
		 * significant bit, we must set them to 1s to avoid having
		 * smaller size than desired.
		 */
		desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1,
					 VTD_PAGE_SHIFT);
		/* Clear size_order bit to indicate size */
		desc->qw1 &= ~mask;
		/* Set the S bit to indicate flushing more than 1 page */
		desc->qw1 |= QI_DEV_EIOTLB_SIZE;
	}
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)

struct dmar_drhd_unit *dmar_find_matched_drhd_unit(struct pci_dev *dev);

int dmar_enable_qi(struct intel_iommu *iommu);
void dmar_disable_qi(struct intel_iommu *iommu);
int dmar_reenable_qi(struct intel_iommu *iommu);
void qi_global_iec(struct intel_iommu *iommu);

void qi_flush_context(struct intel_iommu *iommu, u16 did,
		      u16 sid, u8 fm, u64 type);
void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
		    unsigned int size_order, u64 type);
void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
			u16 qdep, u64 addr, unsigned mask);
void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
		     unsigned long npages, bool ih);
void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
			      u32 pasid, u16 qdep, u64 addr,
			      unsigned int size_order);
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long pages,
			       u32 pasid, u16 qdep);
void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
			  u32 pasid);

int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
		   unsigned int count, unsigned long options);

void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
			 unsigned int size_order, u64 type);

/*
 * Options used in qi_submit_sync:
 * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
 */
#define QI_OPT_WAIT_DRAIN		BIT(0)

int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void device_block_translation(struct device *dev);
int paging_domain_compatible(struct iommu_domain *domain, struct device *dev);

struct dev_pasid_info *
domain_add_dev_pasid(struct iommu_domain *domain,
		     struct device *dev, ioasid_t pasid);
void domain_remove_dev_pasid(struct iommu_domain *domain,
			     struct device *dev, ioasid_t pasid);

int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
			       ioasid_t pasid, u16 did, phys_addr_t fsptptr,
			       int flags, struct iommu_domain *old);

int dmar_ir_support(void);

void iommu_flush_write_buffer(struct intel_iommu *iommu);
struct iommu_domain *
intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
				u32 flags,
				const struct iommu_user_data *user_data);
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);

enum cache_tag_type {
	CACHE_TAG_IOTLB,
	CACHE_TAG_DEVTLB,
	CACHE_TAG_NESTING_IOTLB,
	CACHE_TAG_NESTING_DEVTLB,
};
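/*
 * Illustrative sketch (my addition): building and submitting a single
 * page-selective IOTLB invalidation with the helpers declared above:
 *
 *	struct qi_desc desc;
 *
 *	qi_desc_iotlb(iommu, did, addr, size_order,
 *		      DMA_TLB_PSI_FLUSH, &desc);
 *	qi_submit_sync(iommu, &desc, 1, 0);
 *
 * qi_submit_sync() appends an invalidation-wait descriptor after the
 * submitted ones and returns only when the hardware has completed it.
 */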
struct cache_tag {
	struct list_head node;
	enum cache_tag_type type;
	struct intel_iommu *iommu;
	/*
	 * The @dev field represents the location of the cache. For IOTLB, it
	 * resides on the IOMMU hardware. @dev stores the device pointer to
	 * the IOMMU hardware. For DevTLB, it is located in the PCIe endpoint.
	 * @dev stores the device pointer to that endpoint.
	 */
	struct device *dev;
	u16 domain_id;
	ioasid_t pasid;
	unsigned int users;
};

int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev,
		     ioasid_t pasid, enum cache_tag_type type);
int cache_tag_assign_domain(struct dmar_domain *domain,
			    struct device *dev, ioasid_t pasid);
void cache_tag_unassign_domain(struct dmar_domain *domain,
			       struct device *dev, ioasid_t pasid);
void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
			   unsigned long end, int ih);
void cache_tag_flush_all(struct dmar_domain *domain);
void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
			      unsigned long end);

void intel_context_flush_no_pasid(struct device_domain_info *info,
				  struct context_entry *context, u16 did);

int intel_iommu_enable_prq(struct intel_iommu *iommu);
int intel_iommu_finish_prq(struct intel_iommu *iommu);
void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt,
			       struct iommu_page_response *msg);
void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid);

int intel_iommu_enable_iopf(struct device *dev);
void intel_iommu_disable_iopf(struct device *dev);

static inline int iopf_for_domain_set(struct iommu_domain *domain,
				      struct device *dev)
{
	if (!domain || !domain->iopf_handler)
		return 0;

	return intel_iommu_enable_iopf(dev);
}

static inline void iopf_for_domain_remove(struct iommu_domain *domain,
					  struct device *dev)
{
	if (!domain || !domain->iopf_handler)
		return;

	intel_iommu_disable_iopf(dev);
}

static inline int iopf_for_domain_replace(struct iommu_domain *new,
					  struct iommu_domain *old,
					  struct device *dev)
{
	int ret;

	ret = iopf_for_domain_set(new, dev);
	if (ret)
		return ret;

	iopf_for_domain_remove(old, dev);

	return 0;
}

#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
					    struct mm_struct *mm);
#else
static inline void intel_svm_check(struct intel_iommu *iommu) {}
static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
							  struct mm_struct *mm)
{
	return ERR_PTR(-ENODEV);
}
#endif

#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
void intel_iommu_debugfs_init(void);
void intel_iommu_debugfs_create_dev(struct device_domain_info *info);
void intel_iommu_debugfs_remove_dev(struct device_domain_info *info);
void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid);
void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid);
#else
static inline void intel_iommu_debugfs_init(void) {}
static inline void intel_iommu_debugfs_create_dev(struct device_domain_info *info) {}
static inline void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) {}
static inline void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) {}
static inline void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) {}
#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */

extern const struct attribute_group *intel_iommu_groups[];
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc);

extern const struct iommu_ops intel_iommu_ops;
extern const struct iommu_domain_ops intel_fs_paging_domain_ops;
extern const struct iommu_domain_ops intel_ss_paging_domain_ops;

static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain)
{
	return domain->domain.ops == &intel_fs_paging_domain_ops;
}

static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain)
{
	return domain->domain.ops == &intel_ss_paging_domain_ops;
}

#ifdef CONFIG_INTEL_IOMMU
extern int intel_iommu_sm;
int iommu_calculate_agaw(struct intel_iommu *iommu);
int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob);

static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu)
{
	return (iommu->ecmdcap[DMA_ECMD_ECCAP3] & DMA_ECMD_ECCAP3_ESSENTIAL) ==
		DMA_ECMD_ECCAP3_ESSENTIAL;
}

extern int dmar_disabled;
extern int intel_iommu_enabled;
#else
static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return 0;
}

static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return 0;
}

#define dmar_disabled	(1)
#define intel_iommu_enabled (0)
#define intel_iommu_sm (0)
#endif

static inline const char *decode_prq_descriptor(char *str, size_t size,
						u64 dw0, u64 dw1, u64 dw2,
						u64 dw3)
{
	char *buf = str;
	int bytes;

	bytes = snprintf(buf, size,
			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
			 dw1 & BIT_ULL(0) ? 'r' : '-',
			 dw1 & BIT_ULL(1) ? 'w' : '-',
			 dw0 & BIT_ULL(52) ? 'x' : '-',
			 dw0 & BIT_ULL(53) ? 'p' : '-',
			 dw1 & BIT_ULL(2) ? 'l' : '-',
			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
			 FIELD_GET(GENMASK_ULL(11, 3), dw1));

	/* Private Data */
	if (dw0 & BIT_ULL(9)) {
		size -= bytes;
		buf += bytes;
		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
	}

	return str;
}
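/*
 * Illustrative usage (my addition): decode_prq_descriptor() is convenient
 * when tracing page requests; the buffer size here is chosen loosely for
 * the sketch, not taken from the driver:
 *
 *	char buf[176];
 *
 *	pr_info("prq: %s\n",
 *		decode_prq_descriptor(buf, sizeof(buf), dw0, dw1, dw2, dw3));
 */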
#endif /* _INTEL_IOMMU_H_ */

/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _KERNEL_PRINTK_RINGBUFFER_H
#define _KERNEL_PRINTK_RINGBUFFER_H

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/dev_printk.h>
#include <linux/stddef.h>
#include <linux/types.h>

/*
 * Meta information about each stored message.
 *
 * All fields are set by the printk code except for @seq, which is
 * set by the ringbuffer code.
 */
struct printk_info {
	u64	seq;		/* sequence number */
	u64	ts_nsec;	/* timestamp in nanoseconds */
	u16	text_len;	/* length of text message */
	u8	facility;	/* syslog facility */
	u8	flags:5;	/* internal record flags */
	u8	level:3;	/* syslog level */
	u32	caller_id;	/* thread id or processor id */

	struct dev_printk_info	dev_info;
};

/*
 * A structure providing the buffers, used by writers and readers.
 *
 * Writers:
 * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
 * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
 * buffers reserved for that writer.
 *
 * Readers:
 * Using prb_rec_init_rd(), a reader sets all fields before calling
 * prb_read_valid(). Note that the reader provides the @info and @text_buf
 * buffers. On success, the struct pointed to by @info will be filled and
 * the char array pointed to by @text_buf will be filled with text data.
 */
struct printk_record {
	struct printk_info	*info;
	char			*text_buf;
	unsigned int		text_buf_size;
};

/* Specifies the logical position and span of a data block. */
struct prb_data_blk_lpos {
	unsigned long	begin;
	unsigned long	next;
};

/*
 * A descriptor: the complete meta-data for a record.
 *
 * @state_var: A bitwise combination of descriptor ID and descriptor state.
 */
struct prb_desc {
	atomic_long_t			state_var;
	struct prb_data_blk_lpos	text_blk_lpos;
};
/* A ringbuffer of "ID + data" elements. */
struct prb_data_ring {
	unsigned int	size_bits;
	char		*data;
	atomic_long_t	head_lpos;
	atomic_long_t	tail_lpos;
};

/* A ringbuffer of "struct prb_desc" elements. */
struct prb_desc_ring {
	unsigned int		count_bits;
	struct prb_desc		*descs;
	struct printk_info	*infos;
	atomic_long_t		head_id;
	atomic_long_t		tail_id;
	atomic_long_t		last_finalized_seq;
};

/*
 * The high level structure representing the printk ringbuffer.
 *
 * @fail: Count of failed prb_reserve() calls where not even a data-less
 *        record was created.
 */
struct printk_ringbuffer {
	struct prb_desc_ring	desc_ring;
	struct prb_data_ring	text_data_ring;
	atomic_long_t		fail;
};

/*
 * Used by writers as a reserve/commit handle.
 *
 * @rb:         Ringbuffer where the entry is reserved.
 * @irqflags:   Saved irq flags to restore on entry commit.
 * @id:         ID of the reserved descriptor.
 * @text_space: Total occupied buffer space in the text data ring, including
 *              ID, alignment padding, and wrapping data blocks.
 *
 * This structure is an opaque handle for writers. Its contents are only
 * to be used by the ringbuffer implementation.
 */
struct prb_reserved_entry {
	struct printk_ringbuffer	*rb;
	unsigned long			irqflags;
	unsigned long			id;
	unsigned int			text_space;
};

/* The possible responses of a descriptor state-query. */
enum desc_state {
	desc_miss	=  -1,	/* ID mismatch (pseudo state) */
	desc_reserved	= 0x0,	/* reserved, in use by writer */
	desc_committed	= 0x1,	/* committed by writer, could get reopened */
	desc_finalized	= 0x2,	/* committed, no further modification allowed */
	desc_reusable	= 0x3,	/* free, not yet used by any writer */
};

#define _DATA_SIZE(sz_bits)	(1UL << (sz_bits))
#define _DESCS_COUNT(ct_bits)	(1U << (ct_bits))
#define DESC_SV_BITS		BITS_PER_LONG
#define DESC_FLAGS_SHIFT	(DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK		(3UL << DESC_FLAGS_SHIFT)
#define DESC_STATE(sv)		(3UL & (sv >> DESC_FLAGS_SHIFT))
#define DESC_SV(id, state)	(((unsigned long)state << DESC_FLAGS_SHIFT) | id)
#define DESC_ID_MASK		(~DESC_FLAGS_MASK)
#define DESC_ID(sv)		((sv) & DESC_ID_MASK)

/*
 * Special data block logical position values (for fields of
 * @prb_desc.text_blk_lpos).
 *
 * - Bit0 is used to identify if the record has no data block. (Implemented in
 *   the LPOS_DATALESS() macro.)
 *
 * - Bit1 specifies the reason for not having a data block.
 *
 * These special values could never be real lpos values because of the
 * meta data and alignment padding of data blocks. (See to_blk_size() for
 * details.)
 */
#define FAILED_LPOS		0x1
#define EMPTY_LINE_LPOS		0x3

#define FAILED_BLK_LPOS		\
{				\
	.begin	= FAILED_LPOS,	\
	.next	= FAILED_LPOS,	\
}
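/*
 * Illustrative sketch (my addition): @state_var packs the descriptor state
 * into the top two bits and the descriptor ID into the remaining bits, so
 * a state query looks roughly like:
 *
 *	unsigned long sv = atomic_long_read(&desc->state_var);
 *
 *	if (DESC_STATE(sv) == desc_finalized && DESC_ID(sv) == id)
 *		;	// record with this ID is stable and readable
 */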
/*
 * Descriptor Bootstrap
 *
 * The descriptor array is minimally initialized to allow immediate usage
 * by readers and writers. The requirements that the descriptor array
 * initialization must satisfy:
 *
 *   Req1
 *     The tail must point to an existing (committed or reusable) descriptor.
 *     This is required by the implementation of prb_first_seq().
 *
 *   Req2
 *     Readers must see that the ringbuffer is initially empty.
 *
 *   Req3
 *     The first record reserved by a writer is assigned sequence number 0.
 *
 * To satisfy Req1, the tail initially points to a descriptor that is
 * minimally initialized (having no data block, i.e. data-less with the
 * data block's lpos @begin and @next values set to FAILED_LPOS).
 *
 * To satisfy Req2, the initial tail descriptor is initialized to the
 * reusable state. Readers recognize reusable descriptors as existing
 * records, but skip over them.
 *
 * To satisfy Req3, the last descriptor in the array is used as the initial
 * head (and tail) descriptor. This allows the first record reserved by a
 * writer (head + 1) to be the first descriptor in the array. (Only the first
 * descriptor in the array could have a valid sequence number of 0.)
 *
 * The first time a descriptor is reserved, it is assigned a sequence number
 * with the value of the array index. A "first time reserved" descriptor can
 * be recognized because it has a sequence number of 0 but does not have an
 * index of 0. (Only the first descriptor in the array could have a valid
 * sequence number of 0.) After the first reservation, all future reservations
 * (recycling) simply involve incrementing the sequence number by the array
 * count.
 *
 *   Hack #1
 *     Only the first descriptor in the array is allowed to have the sequence
 *     number 0. In this case it is not possible to recognize if it is being
 *     reserved the first time (set to index value) or has been reserved
 *     previously (increment by the array count). This is handled by _always_
 *     incrementing the sequence number by the array count when reserving the
 *     first descriptor in the array. In order to satisfy Req3, the sequence
 *     number of the first descriptor in the array is initialized to minus
 *     the array count. Then, upon the first reservation, it is incremented
 *     to 0, thus satisfying Req3.
 *
 *   Hack #2
 *     prb_first_seq() can be called at any time by readers to retrieve the
 *     sequence number of the tail descriptor. However, due to Req2 and Req3,
 *     initially there are no records to report the sequence number of
 *     (sequence numbers are u64 and there is nothing less than 0). To handle
 *     this, the sequence number of the initial tail descriptor is initialized
 *     to 0. Technically this is incorrect, because there is no record with
 *     sequence number 0 (yet) and the tail descriptor is not the first
 *     descriptor in the array. But it allows prb_read_valid() to correctly
 *     report the existence of a record for _any_ given sequence number at all
 *     times. Bootstrapping is complete when the tail is pushed the first
 *     time, thus finally pointing to the first descriptor reserved by a
 *     writer, which has the assigned sequence number 0.
 */

/*
 * Initiating Logical Value Overflows
 *
 * Both logical position (lpos) and ID values can be mapped to array indexes
 * but may experience overflows during the lifetime of the system. To ensure
 * that printk_ringbuffer can handle the overflows for these types, initial
 * values are chosen that map to the correct initial array indexes, but will
 * result in overflows soon.
 *
 *   BLK0_LPOS
 *     The initial @head_lpos and @tail_lpos for data rings. It is at index
 *     0 and the lpos value is such that it will overflow on the first wrap.
 *
 *   DESC0_ID
 *     The initial @head_id and @tail_id for the desc ring. It is at the last
 *     index of the descriptor array (see Req3 above) and the ID value is such
 *     that it will overflow on the second wrap.
 */
#define BLK0_LPOS(sz_bits)	(-(_DATA_SIZE(sz_bits)))
#define DESC0_ID(ct_bits)	DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
#define DESC0_SV(ct_bits)	DESC_SV(DESC0_ID(ct_bits), desc_reusable)
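/*
 * Worked example (my addition): with sz_bits == 5, _DATA_SIZE(5) == 32 and
 * BLK0_LPOS(5) == (unsigned long)-32. That value maps to data-array index
 * (-32) & (32 - 1) == 0, yet sits only one ring-size below 0, so the lpos
 * counter overflows on the very first wrap, exercising the overflow paths
 * early in the system's lifetime. DESC0_ID plays the same trick for
 * descriptor IDs.
 */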
/*
 * Define a ringbuffer with an external text data buffer. The same as
 * DEFINE_PRINTKRB() but requires specifying an external buffer for the
 * text data.
 *
 * Note: The specified external buffer must be of the size:
 *       2 ^ (descbits + avgtextbits)
 */
#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf)		\
static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = {	\
	/* the initial head and tail */					\
	[_DESCS_COUNT(descbits) - 1] = {				\
		/* reusable */						\
		.state_var	= ATOMIC_INIT(DESC0_SV(descbits)),	\
		/* no associated data block */				\
		.text_blk_lpos	= FAILED_BLK_LPOS,			\
	},								\
};									\
static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = {	\
	/* this will be the first record reserved by a writer */	\
	[0] = {								\
		/* will be incremented to 0 on the first reservation */	\
		.seq = -(u64)_DESCS_COUNT(descbits),			\
	},								\
	/* the initial head and tail */					\
	[_DESCS_COUNT(descbits) - 1] = {				\
		/* reports the first seq value during the bootstrap phase */ \
		.seq = 0,						\
	},								\
};									\
static struct printk_ringbuffer name = {				\
	.desc_ring = {							\
		.count_bits	= descbits,				\
		.descs		= &_##name##_descs[0],			\
		.infos		= &_##name##_infos[0],			\
		.head_id	= ATOMIC_INIT(DESC0_ID(descbits)),	\
		.tail_id	= ATOMIC_INIT(DESC0_ID(descbits)),	\
		.last_finalized_seq = ATOMIC_INIT(0),			\
	},								\
	.text_data_ring = {						\
		.size_bits	= (avgtextbits) + (descbits),		\
		.data		= text_buf,				\
		.head_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
		.tail_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
	},								\
	.fail			= ATOMIC_LONG_INIT(0),			\
}

/**
 * DEFINE_PRINTKRB() - Define a ringbuffer.
 *
 * @name:        The name of the ringbuffer variable.
 * @descbits:    The number of descriptors as a power-of-2 value.
 * @avgtextbits: The average text data size per record as a power-of-2 value.
 *
 * This is a macro for defining a ringbuffer and all internal structures
 * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
 * variant where the text data buffer can be specified externally.
 */
#define DEFINE_PRINTKRB(name, descbits, avgtextbits)			\
static char _##name##_text[1U << ((avgtextbits) + (descbits))]		\
			__aligned(__alignof__(unsigned long));		\
_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])

/* Writer Interface */

/**
 * prb_rec_init_wr() - Initialize a buffer for writing records.
 *
 * @r:             The record to initialize.
 * @text_buf_size: The needed text buffer size.
 */
static inline void prb_rec_init_wr(struct printk_record *r,
				   unsigned int text_buf_size)
{
	r->info = NULL;
	r->text_buf = NULL;
	r->text_buf_size = text_buf_size;
}

bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
		 struct printk_record *r);
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
			 struct printk_record *r, u32 caller_id, unsigned int max_size);
void prb_commit(struct prb_reserved_entry *e);
void prb_final_commit(struct prb_reserved_entry *e);

void prb_init(struct printk_ringbuffer *rb,
	      char *text_buf, unsigned int text_buf_size,
	      struct prb_desc *descs, unsigned int descs_count_bits,
	      struct printk_info *infos);
unsigned int prb_record_text_space(struct prb_reserved_entry *e);
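/*
 * Illustrative writer flow (my addition, modeled on the writer comments
 * above); test_rb is a hypothetical instance defined with
 * DEFINE_PRINTKRB(test_rb, 5, 7), used only for this sketch:
 *
 *	struct prb_reserved_entry e;
 *	struct printk_record r;
 *
 *	prb_rec_init_wr(&r, 32);
 *	if (prb_reserve(&e, &test_rb, &r)) {
 *		snprintf(r.text_buf, r.text_buf_size, "hello");
 *		r.info->text_len = strlen(r.text_buf);
 *		prb_commit(&e);
 *	}
 */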
*/ static inline void prb_rec_init_rd(struct printk_record *r, struct printk_info *info, char *text_buf, unsigned int text_buf_size) { r->info = info; r->text_buf = text_buf; r->text_buf_size = text_buf_size; } /** * prb_for_each_record() - Iterate over the records of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @r: A printk_record to store the record on each iteration. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @r->info->seq should be checked. * * Context: Any context. */ #define prb_for_each_record(from, rb, s, r) \ for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) /** * prb_for_each_info() - Iterate over the meta data of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @i: A printk_info to store the record meta data on each iteration. * @lc: An unsigned int to store the text line count of each record. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @i->seq should be checked. * * Context: Any context. */ #define prb_for_each_info(from, rb, s, i, lc) \ for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r); bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count); u64 prb_first_seq(struct printk_ringbuffer *rb); u64 prb_first_valid_seq(struct printk_ringbuffer *rb); u64 prb_next_seq(struct printk_ringbuffer *rb); u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); #ifdef CONFIG_64BIT #define __u64seq_to_ulseq(u64seq) (u64seq) #define __ulseq_to_u64seq(rb, ulseq) (ulseq) #define ULSEQ_MAX(rb) (-1) #else /* CONFIG_64BIT */ #define __u64seq_to_ulseq(u64seq) ((u32)u64seq) #define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL) static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) { u64 rb_first_seq = prb_first_seq(rb); u64 seq; /* * The provided sequence is only the lower 32 bits of the ringbuffer * sequence. It needs to be expanded to 64bit. Get the first sequence * number from the ringbuffer and fold it. * * Having a 32bit representation in the console is sufficient. * If a console ever gets more than 2^31 records behind * the ringbuffer then this is the least of the problems. * * Also the access to the ring buffer is always safe. */ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq); return seq; } #endif /* CONFIG_64BIT */ #endif /* _KERNEL_PRINTK_RINGBUFFER_H */
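The 32-bit fold in __ulseq_to_u64seq() above is easier to see with concrete numbers. Below is a minimal userspace sketch of the same arithmetic; the function name fold_seq() and the sample values are ours for illustration, not part of the kernel API:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static uint64_t fold_seq(uint64_t rb_first_seq, uint32_t ulseq)
{
	/* The signed cast sign-extends the 32-bit distance, so ulseq may
	 * lie up to 2^31 records on either side of rb_first_seq. */
	return rb_first_seq - (int32_t)((uint32_t)rb_first_seq - ulseq);
}

int main(void)
{
	uint64_t first = 0x500000123ULL;	/* arbitrary 64-bit tail seq */
	uint64_t seq = first + 1000;		/* a nearby record */

	/* Folding the truncated 32-bit value recovers the full 64-bit seq. */
	assert(fold_seq(first, (uint32_t)seq) == seq);
	printf("recovered %llu\n",
	       (unsigned long long)fold_seq(first, (uint32_t)seq));
	return 0;
}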
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/xattr.h> #include <linux/evm.h> int posix_xattr_acl(const char *xattr) { int xattr_len = strlen(xattr); if ((strlen(XATTR_NAME_POSIX_ACL_ACCESS) == xattr_len) && (strncmp(XATTR_NAME_POSIX_ACL_ACCESS, xattr, xattr_len) == 0)) return 1; if ((strlen(XATTR_NAME_POSIX_ACL_DEFAULT) == xattr_len) && (strncmp(XATTR_NAME_POSIX_ACL_DEFAULT, xattr, xattr_len) == 0)) return 1; return 0; }
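As an aside, the length-equality check plus strncmp() pair above is equivalent to a plain strcmp() against the two well-known POSIX ACL xattr names. A small userspace harness (illustrative only, not kernel code) showing the same decision:

#include <stdio.h>
#include <string.h>

#define XATTR_NAME_POSIX_ACL_ACCESS  "system.posix_acl_access"
#define XATTR_NAME_POSIX_ACL_DEFAULT "system.posix_acl_default"

/* Same result as posix_xattr_acl(): 1 for the two ACL names, else 0. */
static int posix_xattr_acl_demo(const char *xattr)
{
	return strcmp(xattr, XATTR_NAME_POSIX_ACL_ACCESS) == 0 ||
	       strcmp(xattr, XATTR_NAME_POSIX_ACL_DEFAULT) == 0;
}

int main(void)
{
	printf("%d\n", posix_xattr_acl_demo("system.posix_acl_access"));  /* 1 */
	printf("%d\n", posix_xattr_acl_demo("system.posix_acl"));         /* 0 */
	return 0;
}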
// SPDX-License-Identifier: GPL-2.0-only /* * virtio transport for vsock * * Copyright (C) 2013-2015 Red Hat, Inc. * Author: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> * * Some of the code is taken from Gerd Hoffmann <kraxel@redhat.com>'s * early virtio-vsock proof-of-concept bits. */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/virtio.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_vsock.h> #include <net/sock.h> #include <linux/mutex.h> #include <net/af_vsock.h> static struct workqueue_struct *virtio_vsock_workqueue; static struct virtio_vsock __rcu *the_virtio_vsock; static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ static struct virtio_transport virtio_transport; /* forward declaration */ struct virtio_vsock { struct virtio_device *vdev; struct virtqueue *vqs[VSOCK_VQ_MAX]; /* Virtqueue processing is deferred to a workqueue */ struct work_struct tx_work; struct work_struct rx_work; struct work_struct event_work; /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX] * must be accessed with tx_lock held. */ struct mutex tx_lock; bool tx_run; struct work_struct send_pkt_work; struct sk_buff_head send_pkt_queue; atomic_t queued_replies; /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] * must be accessed with rx_lock held. */ struct mutex rx_lock; bool rx_run; int rx_buf_nr; int rx_buf_max_nr; /* The following fields are protected by event_lock. * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. */ struct mutex event_lock; bool event_run; struct virtio_vsock_event event_list[8]; u32 guest_cid; bool seqpacket_allow; /* These fields are used only in tx path in function * 'virtio_transport_send_pkt_work()', so to save * stack space in it, place both of them here. Each * pointer from 'out_sgs' points to the corresponding * element in 'out_bufs' - this is initialized in * 'virtio_vsock_probe()'. Both fields are protected * by 'tx_lock'. +1 is needed for packet header. */ struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1]; struct scatterlist out_bufs[MAX_SKB_FRAGS + 1]; }; static u32 virtio_transport_get_local_cid(void) { struct virtio_vsock *vsock; u32 ret; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { ret = VMADDR_CID_ANY; goto out_rcu; } ret = vsock->guest_cid; out_rcu: rcu_read_unlock(); return ret; } /* Caller needs to hold vsock->tx_lock on vq */ static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq, struct virtio_vsock *vsock, gfp_t gfp) { int ret, in_sg = 0, out_sg = 0; struct scatterlist **sgs; sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); out_sg++; if (!skb_is_nonlinear(skb)) { if (skb->len > 0) { sg_init_one(sgs[out_sg], skb->data, skb->len); out_sg++; } } else { struct skb_shared_info *si; int i; /* If skb is nonlinear, then its buffer must contain * only header and nothing more. Data is stored in * the fragged part.
*/ WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb))); si = skb_shinfo(skb); for (i = 0; i < si->nr_frags; i++) { skb_frag_t *skb_frag = &si->frags[i]; void *va; /* We will use 'page_to_virt()' for the userspace page * here, because virtio or dma-mapping layers will call * 'virt_to_phys()' later to fill the buffer descriptor. * We don't touch memory at "virtual" address of this page. */ va = page_to_virt(skb_frag_page(skb_frag)); sg_init_one(sgs[out_sg], va + skb_frag_off(skb_frag), skb_frag_size(skb_frag)); out_sg++; } } ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, gfp); /* Usually this means that there is no more space available in * the vq */ if (ret < 0) return ret; virtio_transport_deliver_tap_pkt(skb); return 0; } static void virtio_transport_send_pkt_work(struct work_struct *work) { struct virtio_vsock *vsock = container_of(work, struct virtio_vsock, send_pkt_work); struct virtqueue *vq; bool added = false; bool restart_rx = false; mutex_lock(&vsock->tx_lock); if (!vsock->tx_run) goto out; vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { struct sk_buff *skb; bool reply; int ret; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) break; reply = virtio_vsock_skb_reply(skb); ret = virtio_transport_send_skb(skb, vq, vsock, GFP_KERNEL); if (ret < 0) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; val = atomic_dec_return(&vsock->queued_replies); /* Do we now have resources to resume rx processing? */ if (val + 1 == virtqueue_get_vring_size(rx_vq)) restart_rx = true; } added = true; } if (added) virtqueue_kick(vq); out: mutex_unlock(&vsock->tx_lock); if (restart_rx) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } /* Caller needs to hold RCU for vsock. * Returns 0 if the packet is successfully put on the vq. */ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struct sk_buff *skb) { struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; int ret; /* Inside RCU, can't sleep! */ ret = mutex_trylock(&vsock->tx_lock); if (unlikely(ret == 0)) return -EBUSY; ret = virtio_transport_send_skb(skb, vq, vsock, GFP_ATOMIC); if (ret == 0) virtqueue_kick(vq); mutex_unlock(&vsock->tx_lock); return ret; } static int virtio_transport_send_pkt(struct sk_buff *skb) { struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; int len = skb->len; hdr = virtio_vsock_hdr(skb); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { kfree_skb(skb); len = -ENODEV; goto out_rcu; } if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) { kfree_skb(skb); len = -ENODEV; goto out_rcu; } /* If send_pkt_queue is empty, we can safely bypass this queue * because packet order is maintained, and try to put the packet * directly on the virtqueue using virtio_transport_send_skb_fast_path(). * If this fails, we simply put the packet on the intermediate * queue and schedule the worker.
*/ if (!skb_queue_empty_lockless(&vsock->send_pkt_queue) || virtio_transport_send_skb_fast_path(vsock, skb)) { if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); } out_rcu: rcu_read_unlock(); return len; } static int virtio_transport_cancel_pkt(struct vsock_sock *vsk) { struct virtio_vsock *vsock; int cnt = 0, ret; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { ret = -ENODEV; goto out_rcu; } cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int new_cnt; new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) && new_cnt < virtqueue_get_vring_size(rx_vq)) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } ret = 0; out_rcu: rcu_read_unlock(); return ret; } static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; struct scatterlist pkt, *p; struct virtqueue *vq; struct sk_buff *skb; int ret; vq = vsock->vqs[VSOCK_VQ_RX]; do { skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL); if (!skb) break; memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM); sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len); p = &pkt; ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL); if (ret < 0) { kfree_skb(skb); break; } vsock->rx_buf_nr++; } while (vq->num_free); if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) vsock->rx_buf_max_nr = vsock->rx_buf_nr; virtqueue_kick(vq); } static void virtio_transport_tx_work(struct work_struct *work) { struct virtio_vsock *vsock = container_of(work, struct virtio_vsock, tx_work); struct virtqueue *vq; bool added = false; vq = vsock->vqs[VSOCK_VQ_TX]; mutex_lock(&vsock->tx_lock); if (!vsock->tx_run) goto out; do { struct sk_buff *skb; unsigned int len; virtqueue_disable_cb(vq); while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { virtio_transport_consume_skb_sent(skb, true); added = true; } } while (!virtqueue_enable_cb(vq)); out: mutex_unlock(&vsock->tx_lock); if (added) queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); } /* Is there space left for replies to rx packets? */ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) { struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX]; int val; smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ val = atomic_read(&vsock->queued_replies); return val < virtqueue_get_vring_size(vq); } /* event_lock must be held */ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, struct virtio_vsock_event *event) { struct scatterlist sg; struct virtqueue *vq; vq = vsock->vqs[VSOCK_VQ_EVENT]; sg_init_one(&sg, event, sizeof(*event)); return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); } /* event_lock must be held */ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) { size_t i; for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) { struct virtio_vsock_event *event = &vsock->event_list[i]; virtio_vsock_event_fill_one(vsock, event); } virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); } static void virtio_vsock_reset_sock(struct sock *sk) { /* vmci_transport.c doesn't take sk_lock here either. At least we're * under vsock_table_lock so the sock cannot disappear while we're * executing. 
*/ sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk_error_report(sk); } static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; __le64 guest_cid; vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), &guest_cid, sizeof(guest_cid)); vsock->guest_cid = le64_to_cpu(guest_cid); } /* event_lock must be held */ static void virtio_vsock_event_handle(struct virtio_vsock *vsock, struct virtio_vsock_event *event) { switch (le32_to_cpu(event->id)) { case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: virtio_vsock_update_guest_cid(vsock); vsock_for_each_connected_socket(&virtio_transport.transport, virtio_vsock_reset_sock); break; } } static void virtio_transport_event_work(struct work_struct *work) { struct virtio_vsock *vsock = container_of(work, struct virtio_vsock, event_work); struct virtqueue *vq; vq = vsock->vqs[VSOCK_VQ_EVENT]; mutex_lock(&vsock->event_lock); if (!vsock->event_run) goto out; do { struct virtio_vsock_event *event; unsigned int len; virtqueue_disable_cb(vq); while ((event = virtqueue_get_buf(vq, &len)) != NULL) { if (len == sizeof(*event)) virtio_vsock_event_handle(vsock, event); virtio_vsock_event_fill_one(vsock, event); } } while (!virtqueue_enable_cb(vq)); virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); out: mutex_unlock(&vsock->event_lock); } static void virtio_vsock_event_done(struct virtqueue *vq) { struct virtio_vsock *vsock = vq->vdev->priv; if (!vsock) return; queue_work(virtio_vsock_workqueue, &vsock->event_work); } static void virtio_vsock_tx_done(struct virtqueue *vq) { struct virtio_vsock *vsock = vq->vdev->priv; if (!vsock) return; queue_work(virtio_vsock_workqueue, &vsock->tx_work); } static void virtio_vsock_rx_done(struct virtqueue *vq) { struct virtio_vsock *vsock = vq->vdev->priv; if (!vsock) return; queue_work(virtio_vsock_workqueue, &vsock->rx_work); } static bool virtio_transport_can_msgzerocopy(int bufs_num) { struct virtio_vsock *vsock; bool res = false; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (vsock) { struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; /* Check that tx queue is large enough to keep whole * data to send. This is needed, because when there is * not enough free space in the queue, current skb to * send will be reinserted to the head of tx list of * the socket to retry transmission later, so if skb * is bigger than whole queue, it will be reinserted * again and again, thus blocking other skbs to be sent. * Each page of the user provided buffer will be added * as a single buffer to the tx virtqueue, so compare * number of pages against maximum capacity of the queue. 
*/ if (bufs_num <= vq->num_max) res = true; } rcu_read_unlock(); return res; } static bool virtio_transport_msgzerocopy_allow(void) { return true; } static bool virtio_transport_seqpacket_allow(u32 remote_cid); static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = virtio_transport_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = virtio_transport_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_allow = virtio_transport_dgram_allow, .stream_dequeue = virtio_transport_stream_dequeue, .stream_enqueue = virtio_transport_stream_enqueue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = virtio_transport_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = virtio_transport_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .unsent_bytes = virtio_transport_unsent_bytes, .read_skb = virtio_transport_read_skb, }, .send_pkt = virtio_transport_send_pkt, .can_msgzerocopy = virtio_transport_can_msgzerocopy, }; static bool virtio_transport_seqpacket_allow(u32 remote_cid) { struct virtio_vsock *vsock; bool seqpacket_allow; seqpacket_allow = false; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (vsock) seqpacket_allow = vsock->seqpacket_allow; rcu_read_unlock(); return seqpacket_allow; } static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = container_of(work, struct virtio_vsock, rx_work); struct virtqueue *vq; vq = vsock->vqs[VSOCK_VQ_RX]; mutex_lock(&vsock->rx_lock); if (!vsock->rx_run) goto out; do { virtqueue_disable_cb(vq); for (;;) { unsigned int len, payload_len; struct virtio_vsock_hdr *hdr; struct sk_buff *skb; if (!virtio_transport_more_replies(vsock)) { /* Stop rx until the device processes already * pending replies. Leave rx virtqueue * callbacks disabled. 
*/ goto out; } skb = virtqueue_get_buf(vq, &len); if (!skb) break; vsock->rx_buf_nr--; /* Drop short/long packets */ if (unlikely(len < sizeof(*hdr) || len > virtio_vsock_skb_len(skb))) { kfree_skb(skb); continue; } hdr = virtio_vsock_hdr(skb); payload_len = le32_to_cpu(hdr->len); if (unlikely(payload_len > len - sizeof(*hdr))) { kfree_skb(skb); continue; } if (payload_len) virtio_vsock_skb_put(skb, payload_len); virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&virtio_transport, skb); } } while (!virtqueue_enable_cb(vq)); out: if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) virtio_vsock_rx_fill(vsock); mutex_unlock(&vsock->rx_lock); } static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; struct virtqueue_info vqs_info[] = { { "rx", virtio_vsock_rx_done }, { "tx", virtio_vsock_tx_done }, { "event", virtio_vsock_event_done }, }; int ret; mutex_lock(&vsock->rx_lock); vsock->rx_buf_nr = 0; vsock->rx_buf_max_nr = 0; mutex_unlock(&vsock->rx_lock); atomic_set(&vsock->queued_replies, 0); ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL); if (ret < 0) return ret; virtio_vsock_update_guest_cid(vsock); virtio_device_ready(vdev); return 0; } static void virtio_vsock_vqs_start(struct virtio_vsock *vsock) { mutex_lock(&vsock->tx_lock); vsock->tx_run = true; mutex_unlock(&vsock->tx_lock); mutex_lock(&vsock->rx_lock); virtio_vsock_rx_fill(vsock); vsock->rx_run = true; mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->event_lock); virtio_vsock_event_fill(vsock); vsock->event_run = true; mutex_unlock(&vsock->event_lock); /* virtio_transport_send_pkt() can queue packets once * the_virtio_vsock is set, but they won't be processed until * vsock->tx_run is set to true. We queue vsock->send_pkt_work * when initialization finishes to send those packets queued * earlier. * We don't need to queue the other workers (rx, event) because * as long as we don't fill the queues with empty buffers, the * host can't send us any notification. */ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); } static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; struct sk_buff *skb; /* Reset all connected sockets when the VQs disappear */ vsock_for_each_connected_socket(&virtio_transport.transport, virtio_vsock_reset_sock); /* Stop all work handlers to make sure no one is accessing the device, * so we can safely call virtio_reset_device(). */ mutex_lock(&vsock->rx_lock); vsock->rx_run = false; mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->tx_lock); vsock->tx_run = false; mutex_unlock(&vsock->tx_lock); mutex_lock(&vsock->event_lock); vsock->event_run = false; mutex_unlock(&vsock->event_lock); /* Flush all device writes and interrupts, device will not use any * more buffers. 
*/ virtio_reset_device(vdev); mutex_lock(&vsock->rx_lock); while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) kfree_skb(skb); mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->tx_lock); while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) kfree_skb(skb); mutex_unlock(&vsock->tx_lock); virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); } static int virtio_vsock_probe(struct virtio_device *vdev) { struct virtio_vsock *vsock = NULL; int ret; int i; ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); if (ret) return ret; /* Only one virtio-vsock device per guest is supported */ if (rcu_dereference_protected(the_virtio_vsock, lockdep_is_held(&the_virtio_vsock_mutex))) { ret = -EBUSY; goto out; } vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); if (!vsock) { ret = -ENOMEM; goto out; } vsock->vdev = vdev; mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); mutex_init(&vsock->event_lock); skb_queue_head_init(&vsock->send_pkt_queue); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET)) vsock->seqpacket_allow = true; vdev->priv = vsock; ret = virtio_vsock_vqs_init(vsock); if (ret < 0) goto out; for (i = 0; i < ARRAY_SIZE(vsock->out_sgs); i++) vsock->out_sgs[i] = &vsock->out_bufs[i]; rcu_assign_pointer(the_virtio_vsock, vsock); virtio_vsock_vqs_start(vsock); mutex_unlock(&the_virtio_vsock_mutex); return 0; out: kfree(vsock); mutex_unlock(&the_virtio_vsock_mutex); return ret; } static void virtio_vsock_remove(struct virtio_device *vdev) { struct virtio_vsock *vsock = vdev->priv; mutex_lock(&the_virtio_vsock_mutex); vdev->priv = NULL; rcu_assign_pointer(the_virtio_vsock, NULL); synchronize_rcu(); virtio_vsock_vqs_del(vsock); /* Other works can be queued before 'config->del_vqs()', so we flush * all works before freeing the vsock object to avoid use after free.
*/ flush_work(&vsock->rx_work); flush_work(&vsock->tx_work); flush_work(&vsock->event_work); flush_work(&vsock->send_pkt_work); mutex_unlock(&the_virtio_vsock_mutex); kfree(vsock); } #ifdef CONFIG_PM_SLEEP static int virtio_vsock_freeze(struct virtio_device *vdev) { struct virtio_vsock *vsock = vdev->priv; mutex_lock(&the_virtio_vsock_mutex); rcu_assign_pointer(the_virtio_vsock, NULL); synchronize_rcu(); virtio_vsock_vqs_del(vsock); mutex_unlock(&the_virtio_vsock_mutex); return 0; } static int virtio_vsock_restore(struct virtio_device *vdev) { struct virtio_vsock *vsock = vdev->priv; int ret; mutex_lock(&the_virtio_vsock_mutex); /* Only one virtio-vsock device per guest is supported */ if (rcu_dereference_protected(the_virtio_vsock, lockdep_is_held(&the_virtio_vsock_mutex))) { ret = -EBUSY; goto out; } ret = virtio_vsock_vqs_init(vsock); if (ret < 0) goto out; rcu_assign_pointer(the_virtio_vsock, vsock); virtio_vsock_vqs_start(vsock); out: mutex_unlock(&the_virtio_vsock_mutex); return ret; } #endif /* CONFIG_PM_SLEEP */ static struct virtio_device_id id_table[] = { { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_VSOCK_F_SEQPACKET }; static struct virtio_driver virtio_vsock_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .id_table = id_table, .probe = virtio_vsock_probe, .remove = virtio_vsock_remove, #ifdef CONFIG_PM_SLEEP .freeze = virtio_vsock_freeze, .restore = virtio_vsock_restore, #endif }; static int __init virtio_vsock_init(void) { int ret; virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", WQ_PERCPU, 0); if (!virtio_vsock_workqueue) return -ENOMEM; ret = vsock_core_register(&virtio_transport.transport, VSOCK_TRANSPORT_F_G2H); if (ret) goto out_wq; ret = register_virtio_driver(&virtio_vsock_driver); if (ret) goto out_vci; return 0; out_vci: vsock_core_unregister(&virtio_transport.transport); out_wq: destroy_workqueue(virtio_vsock_workqueue); return ret; } static void __exit virtio_vsock_exit(void) { unregister_virtio_driver(&virtio_vsock_driver); vsock_core_unregister(&virtio_transport.transport); destroy_workqueue(virtio_vsock_workqueue); } module_init(virtio_vsock_init); module_exit(virtio_vsock_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("virtio transport for vsock"); MODULE_DEVICE_TABLE(virtio, id_table);
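For context, this driver is the guest side of the transport that ordinary AF_VSOCK sockets ride on. A minimal userspace client as it might run inside a guest using this transport; the port number 1234 is a made-up example and assumes a host-side listener exists:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_HOST,	/* well-known CID 2: the host */
		.svm_port = 1234,		/* hypothetical example port */
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	/* Connecting from the guest exercises virtio_transport_send_pkt()
	 * and the tx worker/fast path seen above. */
	if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("vsock");
		return 1;
	}
	write(fd, "ping\n", 5);
	close(fd);
	return 0;
}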
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/tty_port.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.)
*/ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) 
_L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @ctrl: control settings grouped together * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. 
&struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; int index; struct device *dev; struct tty_driver *driver; struct tty_port *port; const struct tty_operations *ops; struct tty_ldisc *ldisc; struct ld_semaphore ldisc_sem; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; unsigned int receive_room; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; } flow; struct { struct pid *pgrp; struct pid *session; spinlock_t lock; unsigned char pktstatus; bool packet; } ctrl; bool hw_stopped; bool closing; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; int write_cnt; u8 *write_buf; struct list_head tty_files; struct work_struct SAK_work; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. 
* * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ enum tty_struct_flags { TTY_THROTTLED, TTY_IO_ERROR, TTY_OTHER_CLOSED, TTY_EXCLUSIVE, TTY_DO_WRITE_WAKEUP, TTY_LDISC_OPEN, TTY_PTY_LOCK, TTY_NO_WRITE_SPLIT, TTY_HUPPED, TTY_HUPPING, TTY_LDISC_CHANGING, TTY_LDISC_HALTED, }; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern const struct class tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Returns: a new reference to a tty object * * Locking: The caller must hold sufficient locks/counts to ensure that their * existing reference cannot go away. 
*/ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); bool tty_throttle_safe(struct tty_struct *tty); bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns: the baud rate as an integer for this terminal * * Locking: The termios lock must be held by the caller. */ static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct 
tty_struct *tty); void tty_lock_slave(struct tty_struct *tty); void tty_unlock_slave(struct tty_struct *tty); void tty_set_lock_subclass(struct tty_struct *tty); #endif
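The termios flag and c_cc macros above have direct userspace counterparts: the same c_lflag bits and control characters are visible through tcgetattr(). A small illustrative program (assumes a real terminal on stdin; not kernel code):

#include <stdio.h>
#include <termios.h>
#include <unistd.h>

int main(void)
{
	struct termios tio;

	if (tcgetattr(STDIN_FILENO, &tio) < 0) {
		perror("tcgetattr");
		return 1;
	}
	/* Userspace analogues of L_ICANON(tty) and L_ECHO(tty). */
	printf("ICANON: %s\n", (tio.c_lflag & ICANON) ? "set" : "clear");
	printf("ECHO:   %s\n", (tio.c_lflag & ECHO) ? "set" : "clear");
	/* Analogue of INTR_CHAR(tty): usually 0x03 (Ctrl-C). */
	printf("VINTR:  0x%02x\n", tio.c_cc[VINTR]);
	return 0;
}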
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET6 connection oriented protocols. * * Authors: See the TCPv6 sources */ #include <linux/module.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_ecn.h> #include <net/inet_hashtables.h> #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> #include <net/sock_reuseport.h> struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) return NULL; return dst; } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { return __sk_dst_check(sk, cookie); } static struct dst_entry *inet6_csk_route_socket(struct sock *sk, struct flowi6 *fl6) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl6->flowlabel); fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) ip6_dst_store(sk, dst, false, false); } return dst; } int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; struct dst_entry *dst; int res; dst = inet6_csk_route_socket(sk, &fl6); if (IS_ERR(dst)) { WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); sk->sk_route_caps = 0; kfree_skb(skb); return PTR_ERR(dst); } rcu_read_lock(); skb_dst_set_noref(skb, dst); /* Restore final destination back after routing done */
fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(inet6_csk_xmit); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) { struct flowi6 fl6; struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); if (IS_ERR(dst)) return NULL; dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; }
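inet6_csk_route_socket() above follows a common cookie-validated cache pattern via __sk_dst_check(): trust the cached dst only while its generation cookie matches, otherwise redo the full lookup (as inet6_csk_update_pmtu() does after invalidating the path MTU). A self-contained sketch of the pattern with hypothetical names, not the kernel API:

#include <stdio.h>

static unsigned int route_generation;	/* bumped whenever routes change */

static void *expensive_route_lookup(void)
{
	static int the_route;
	return &the_route;	/* placeholder for a real lookup */
}

struct route_cache {
	void *dst;		/* cached route, NULL if none */
	unsigned int cookie;	/* route_generation at caching time */
};

static void *route_check_or_lookup(struct route_cache *c)
{
	/* Like __sk_dst_check(): discard the cache on generation change. */
	if (c->dst && c->cookie != route_generation)
		c->dst = NULL;
	if (!c->dst) {
		c->dst = expensive_route_lookup();
		c->cookie = route_generation;
	}
	return c->dst;
}

int main(void)
{
	struct route_cache c = { 0 };

	route_check_or_lookup(&c);	/* miss: full lookup */
	route_check_or_lookup(&c);	/* hit: cookie still matches */
	route_generation++;		/* e.g. routing table updated */
	route_check_or_lookup(&c);	/* miss again: cookie is stale */
	printf("done\n");
	return 0;
}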
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2012 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * Core driver model functions and structures that should not be * shared outside of the drivers/base/ directory. * */ #include <linux/notifier.h> /** * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. * * @subsys - the struct kset that defines this subsystem * @devices_kset - the subsystem's 'devices' directory * @interfaces - list of subsystem interfaces associated * @mutex - protect the devices and interfaces lists. * * @drivers_kset - the list of drivers associated * @klist_devices - the klist to iterate over the @devices_kset * @klist_drivers - the klist to iterate over the @drivers_kset * @bus_notifier - the bus notifier list for anything that cares about things * on this bus. * @bus - pointer back to the struct bus_type that this structure is associated * with. * @dev_root: Default device to use as the parent. * * @glue_dirs - "glue" directory to put in-between the parent device to * avoid namespace conflicts * @class - pointer back to the struct class that this structure is associated * with. * @lock_key: Lock class key for use by the lock validator * * This structure is the one that is the actual kobject allowing struct * bus_type/class to be statically allocated safely. Nothing outside of the * driver core should ever touch these fields.
*/ struct subsys_private { struct kset subsys; struct kset *devices_kset; struct list_head interfaces; struct mutex mutex; struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; const struct bus_type *bus; struct device *dev_root; struct kset glue_dirs; const struct class *class; struct lock_class_key lock_key; }; #define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj) static inline struct subsys_private *subsys_get(struct subsys_private *sp) { if (sp) kset_get(&sp->subsys); return sp; } static inline void subsys_put(struct subsys_private *sp) { if (sp) kset_put(&sp->subsys); } struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { struct kobject kobj; struct klist klist_devices; struct klist_node knode_bus; struct module_kobject *mkobj; struct device_driver *driver; }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list * @knode_bus - node in bus list * @knode_class - node in class list * @deferred_probe - entry in deferred_probe_list which is used to retry the * binding of drivers which were unable to get all the resources needed by * the device; typically because it depends on another driver getting * probed first. * @async_driver - pointer to device driver awaiting probe via async_probe * @device - pointer back to the struct device that this structure is * associated with. * @dead - This device is currently either in the process of or has been * removed from the system. Any asynchronous events scheduled for this * device should exit without taking any action. * * Nothing outside of the driver core should ever touch these fields. 
*/ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; struct klist_node knode_bus; struct klist_node knode_class; struct list_head deferred_probe; const struct device_driver *async_driver; char *deferred_probe_reason; struct device *device; u8 dead:1; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) #define to_device_private_bus(obj) \ container_of(obj, struct device_private, knode_bus) #define to_device_private_class(obj) \ container_of(obj, struct device_private, knode_class) /* initialisation functions */ int devices_init(void); int buses_init(void); int classes_init(void); int firmware_init(void); #ifdef CONFIG_SYS_HYPERVISOR int hypervisor_init(void); #else static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS void auxiliary_bus_init(void); #else static inline void auxiliary_bus_init(void) { } #endif struct kobject *virtual_device_parent(void); int bus_add_device(struct device *dev); void bus_probe_device(struct device *dev); void bus_remove_device(struct device *dev); void bus_notify(struct device *dev, enum bus_notifier_event value); bool bus_is_registered(const struct bus_type *bus); int bus_add_driver(struct device_driver *drv); void bus_remove_driver(struct device_driver *drv); void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent); void driver_detach(const struct device_driver *drv); void driver_deferred_probe_del(struct device *dev); void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf); static inline int driver_match_device(const struct device_driver *drv, struct device *dev) { return drv->bus->match ? drv->bus->match(dev, drv) : 1; } static inline void dev_sync_state(struct device *dev) { if (dev->bus->sync_state) dev->bus->sync_state(dev); else if (dev->driver && dev->driver->sync_state) dev->driver->sync_state(dev); } int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups); void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { /* * Majority (all?) read accesses to dev->driver happens either * while holding device lock or in bus/driver code that is only * invoked when the device is bound to a driver and there is no * concern of the pointer being changed while it is being read. * However when reading device's uevent file we read driver pointer * without taking device lock (so we do not block there for * arbitrary amount of time). We use WRITE_ONCE() here to prevent * tearing so that READ_ONCE() can safely be used in uevent code. 
*/ // FIXME - this cast should not be needed "soon" WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); void deferred_probe_extend_timeout(void); void driver_deferred_probe_trigger(void); const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); /* /sys/devices directory */ extern struct kset *devices_kset; void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) int module_add_driver(struct module *mod, const struct device_driver *drv); void module_remove_driver(const struct device_driver *drv); #else static inline int module_add_driver(struct module *mod, struct device_driver *drv) { return 0; } static inline void module_remove_driver(struct device_driver *drv) { } #endif #ifdef CONFIG_DEVTMPFS int devtmpfs_init(void); #else static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } #else static inline bool is_blockdev(struct device *dev) { return false; } #endif /* Device links support */ int device_links_read_lock(void); void device_links_read_unlock(int idx); int device_links_read_lock_held(void); int device_links_check_suppliers(struct device *dev); void device_links_force_bind(struct device *dev); void device_links_driver_bound(struct device *dev); void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); #define dev_for_each_link_to_supplier(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.suppliers, c_node, \ device_links_read_lock_held()) #define dev_for_each_link_to_consumer(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.consumers, s_node, \ device_links_read_lock_held()) /* device pm support */ void device_pm_move_to_tail(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_create_node(struct device *dev); int devtmpfs_delete_node(struct device *dev); #else static inline int devtmpfs_create_node(struct device *dev) { return 0; } static inline int devtmpfs_delete_node(struct device *dev) { return 0; } #endif void software_node_notify(struct device *dev); void software_node_notify_remove(struct device *dev);
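The to_subsys_private() and to_driver() macros above are instances of the kernel's container_of() pattern: given a pointer to a member embedded inside a larger structure, subtract the member's offset to recover the enclosing structure. That is what lets subsys_get()/subsys_put() refcount the whole subsys_private through its embedded kset: the kobject core only ever sees the embedded kobject, and container_of() gets back to the private structure. A minimal, self-contained sketch of the pattern (plain userspace C; the demo_* names are invented for illustration and are not kernel API):

#include <stddef.h>
#include <stdio.h>

/* Classic minimal container_of(); the kernel's version adds type checking. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_kobject { int refcount; };

struct demo_subsys {
	const char *name;
	struct demo_kobject kobj;	/* embedded, like subsys_private's subsys.kobj */
};

int main(void)
{
	struct demo_subsys s = { .name = "pci", .kobj = { .refcount = 1 } };
	struct demo_kobject *kp = &s.kobj;
	/* Recover the enclosing structure from the embedded member. */
	struct demo_subsys *sp = container_of(kp, struct demo_subsys, kobj);

	printf("%s\n", sp->name);	/* prints "pci" */
	return 0;
}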
// SPDX-License-Identifier: GPL-2.0-only /* Tom Kelly's Scalable TCP * * See http://www.deneholme.net/tom/scalable/ * * John Heffner <jheffner@psc.edu> */ #include <linux/module.h> #include <net/tcp.h> /* These factors derived from the recommended values in the paper: * .01 and 7/8. */ #define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT), acked); } static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U); } static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, .name = "scalable", }; static int __init tcp_scalable_register(void) { return tcp_register_congestion_control(&tcp_scalable); } static void __exit tcp_scalable_unregister(void) { tcp_unregister_congestion_control(&tcp_scalable); } module_init(tcp_scalable_register); module_exit(tcp_scalable_unregister); MODULE_AUTHOR("John Heffner"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Scalable TCP");
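A worked illustration of the two constants above (standalone C with invented window values, not kernel code): TCP_SCALABLE_MD_SCALE = 3 means a loss cuts the window by cwnd >> 3, i.e. Scalable TCP keeps 7/8 of cwnd, and TCP_SCALABLE_AI_CNT = 100 caps the additive-increase divisor passed to tcp_cong_avoid_ai(), so once cwnd exceeds 100 the window grows by roughly one segment per 100 ACKed segments (the paper's 0.01):

#include <stdio.h>

/* Mirrors tcp_scalable_ssthresh()'s arithmetic on plain integers. */
static unsigned int scalable_ssthresh(unsigned int cwnd)
{
	unsigned int s = cwnd - (cwnd >> 3);	/* keep 7/8 of the window */

	return s > 2 ? s : 2;			/* never below 2 segments */
}

int main(void)
{
	printf("%u\n", scalable_ssthresh(1000));	/* 875 */
	printf("%u\n", scalable_ssthresh(2));		/* clamped to 2 */
	return 0;
}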
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * Copyright (c) 2012 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/string.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/ip.h> struct nft_nat { u8 sreg_addr_min; u8 sreg_addr_max; u8 sreg_proto_min; u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; }; static void nft_nat_setup_addr(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { switch (priv->family) { case AF_INET: range->min_addr.ip = (__force __be32) regs->data[priv->sreg_addr_min]; range->max_addr.ip = (__force __be32) regs->data[priv->sreg_addr_max]; break; case AF_INET6: memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], sizeof(range->min_addr.ip6)); memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], sizeof(range->max_addr.ip6)); break; } } static void nft_nat_setup_proto(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { range->min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range->max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } static void nft_nat_setup_netmap(struct nf_nat_range2 *range, const struct nft_pktinfo *pkt, const struct nft_nat *priv) { struct sk_buff *skb = pkt->skb; union nf_inet_addr new_addr; __be32 netmask; int i, len = 0; switch (priv->type) { case NFT_NAT_SNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->saddr; len = sizeof(struct
in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags |= 
NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return 
ret; } static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
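The per-word loop in nft_nat_setup_netmap() above implements prefix rewriting: on bits where the configured minimum and maximum addresses agree the result takes the configured prefix, and on the remaining bits it keeps the packet's original address. A self-contained sketch of that arithmetic for one 32-bit word (plain C on host-order values instead of __be32; the addresses are invented examples):

#include <stdint.h>
#include <stdio.h>

static uint32_t netmap_one_word(uint32_t addr, uint32_t min, uint32_t max)
{
	uint32_t netmask = ~(min ^ max);	/* 1-bits where the range is fixed */

	addr &= ~netmask;			/* keep the packet's host part */
	addr |= min & netmask;			/* take the network part from the prefix */
	return addr;
}

int main(void)
{
	/* Map 10.1.2.3 into 192.168.0.0/16; the host bits 0x0203 survive. */
	printf("%08x\n", netmap_one_word(0x0a010203, 0xc0a80000, 0xc0a8ffff));
	/* prints c0a80203, i.e. 192.168.2.3 */
	return 0;
}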
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_TUNNEL_H #define _NET_IP6_TUNNEL_H #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/dst_cache.h> #define IP6TUNNEL_ERR_TIMEO (30*HZ) /* capable of sending packets */ #define IP6_TNL_F_CAP_XMIT 0x10000 /* capable of receiving packets */ #define IP6_TNL_F_CAP_RCV 0x20000 /* determine capability on a per-packet basis */ #define IP6_TNL_F_CAP_PER_PACKET 0x40000 struct __ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ int link; /* ifindex of underlying L2 interface */ __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; __u32 fwmark; __u32 index; /* ERSPAN type II index */ __u8 erspan_ver; /* ERSPAN version */ __u8 dir; /* direction */ __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_cache dst_cache; /* cached dst */ struct gro_cells gro_cells; int err_count; unsigned long err_time; /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ struct ip_tunnel_encap encap; int mlink; }; struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET extern const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap); static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) { const struct ip6_tnl_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[e->type]); if (likely(ops &&
ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, u8 *protocol, struct flowi6 *fl6) { const struct ip6_tnl_encap_ops *ops; int ret = -EINVAL; if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[t->encap.type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, &t->encap, protocol, fl6); rcu_read_unlock(); return ret; } /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto); __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } } #endif #endif
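ip6_encap_hlen() and ip6_tnl_encap() above share one dispatch shape: a small fixed-size table of ops indexed by encapsulation type, where type 0 ("none") short-circuits to 0, an out-of-range type is rejected, and an empty slot or missing callback yields -EINVAL. A minimal userspace sketch of that shape (no RCU; the demo_* names are invented for illustration):

#include <stdio.h>

#define DEMO_MAX_ENCAP_OPS	8
#define DEMO_ENCAP_NONE		0
#define DEMO_EINVAL		22

struct demo_encap_ops {
	int (*encap_hlen)(void);
};

static int demo_fou_hlen(void) { return 8; }	/* e.g. one UDP header */

static const struct demo_encap_ops demo_fou_ops = { .encap_hlen = demo_fou_hlen };

static const struct demo_encap_ops *demo_encaps[DEMO_MAX_ENCAP_OPS] = {
	[1] = &demo_fou_ops,	/* only type 1 is registered */
};

static int demo_encap_hlen(unsigned int type)
{
	const struct demo_encap_ops *ops;

	if (type == DEMO_ENCAP_NONE)
		return 0;			/* nothing to add */
	if (type >= DEMO_MAX_ENCAP_OPS)
		return -DEMO_EINVAL;
	ops = demo_encaps[type];		/* rcu_dereference() in the kernel */
	return (ops && ops->encap_hlen) ? ops->encap_hlen() : -DEMO_EINVAL;
}

int main(void)
{
	/* none, registered, empty slot: prints "0 8 -22" */
	printf("%d %d %d\n", demo_encap_hlen(0), demo_encap_hlen(1), demo_encap_hlen(5));
	return 0;
}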
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/flow.h> #include <net/gro_cells.h> #include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #if IS_ENABLED(CONFIG_IPV6) #include
<net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. 
*/ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)(info)->options),\ struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; 
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_dst are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ?
0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline enum skb_drop_reason pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull_reason(skb, nhlen); } static inline bool pskb_inet_may_pull(struct sk_buff *skb) { return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of 
pskb_inet_may_pull(). */ static inline enum skb_drop_reason skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ reason = pskb_may_pull_reason(skb, maclen + nhlen); if (reason) return reason; skb_set_network_header(skb, maclen); return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); static inline void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) { /* we must cap headroom to some 
upperlimit, else pskb_expand_head * will overflow header offsets in skb_headers_offset_update(). */ const unsigned int max_allowed = 512; if (headroom > max_allowed) headroom = max_allowed; if (headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, headroom); } int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */
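One easy-to-misread detail in the header above is the key32_to_tunnel_id()/tunnel_id_to_key32() pair: the shift by 32 happens only on little-endian hosts because the conversion must preserve the on-the-wire byte layout, with the 32-bit key occupying the least-significant 32 bits of the big-endian 64-bit tunnel id. An endianness-independent restatement of the same mapping (standalone C using memcpy in place of the kernel's conditional shifts; uint32_t/uint64_t stand in for __be32/__be64):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t demo_key32_to_tunnel_id(uint32_t key_be)
{
	uint64_t id = 0;

	/* The key's 4 bytes become the id's last 4 bytes in memory,
	 * i.e. the least-significant 32 bits of the big-endian id. */
	memcpy((unsigned char *)&id + 4, &key_be, sizeof(key_be));
	return id;
}

static uint32_t demo_tunnel_id_to_key32(uint64_t id)
{
	uint32_t key_be;

	memcpy(&key_be, (unsigned char *)&id + 4, sizeof(key_be));
	return key_be;
}

int main(void)
{
	uint32_t key = 0x01020304;	/* stands in for a __be32 key */

	/* Round-trips on any host endianness: prints "ok". */
	printf("%s\n", demo_tunnel_id_to_key32(demo_key32_to_tunnel_id(key)) == key ? "ok" : "bug");
	return 0;
}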
// SPDX-License-Identifier: GPL-2.0-or-later /* Module signature checker * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/module_signature.h> #include <linux/string.h> #include <linux/verification.h> #include <linux/security.h> #include <crypto/public_key.h> #include <uapi/linux/module.h> #include "internal.h" #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); /* * Export the sig_enforce kernel cmdline parameter so that other subsystems * can rely on it instead of checking CONFIG_MODULE_SIG_FORCE directly. */ bool is_module_sig_enforced(void) { return sig_enforce; } EXPORT_SYMBOL(is_module_sig_enforced); void set_module_sig_enforced(void) { sig_enforce = true; } /* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; size_t sig_len, modlen = info->len; int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); if (modlen <= sizeof(ms)) return -EBADMSG; memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); ret = mod_check_sig(&ms, modlen, "module"); if (ret) return ret; sig_len = be32_to_cpu(ms.sig_len); modlen -= sig_len + sizeof(ms); info->len = modlen; return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } int module_sig_check(struct load_info *info, int flags) { int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const char *reason; const void *mod = info->hdr; bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | MODULE_INIT_IGNORE_VERMAGIC); /* * Do not allow mangled modules as a module with version information * removed is no longer the module that was signed. */ if (!mangled_module && info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; err = mod_verify_sig(mod, info); if (!err) { info->sig_ok = true; return 0; } } /* * We don't permit modules to be loaded into the trusted kernels * without a valid signature on them, but if we're not enforcing, * certain errors are non-fatal. */ switch (err) { case -ENODATA: reason = "unsigned module"; break; case -ENOPKG: reason = "module with unsupported crypto"; break; case -ENOKEY: reason = "module with unavailable key"; break; default: /* * All other errors are fatal, including lack of memory, * unparseable signatures, and signature check failures -- * even if signatures aren't required. */ return err; } if (is_module_sig_enforced()) { pr_notice("Loading of %s is rejected\n", reason); return -EKEYREJECTED; } return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); }
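module_sig_check() above walks the module image from the end: the magic string sits last, preceded by a fixed-size struct module_signature, preceded by sig_len bytes of PKCS#7 data, and everything before that is the payload that was actually signed. A self-contained sketch of that trailer parsing (userspace C; the magic string and field layout follow include/linux/module_signature.h, while the demo_* names and buffer contents are invented and no cryptographic verification is performed):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl()/htonl() */

#define MODULE_SIG_STRING "~Module signature appended~\n"

struct demo_module_signature {
	uint8_t  algo, hash, id_type;
	uint8_t  signer_len, key_id_len;
	uint8_t  pad[3];
	uint32_t sig_len;	/* big-endian length of the signature */
};

/* Shrink *len to the signed payload; return the signature length or -1. */
static long demo_strip_signature(const unsigned char *buf, size_t *len)
{
	const size_t marker = sizeof(MODULE_SIG_STRING) - 1;
	struct demo_module_signature ms;
	uint32_t sig_len;

	if (*len <= marker + sizeof(ms) ||
	    memcmp(buf + *len - marker, MODULE_SIG_STRING, marker) != 0)
		return -1;			/* no signature appended */
	*len -= marker;				/* drop the magic string */
	memcpy(&ms, buf + *len - sizeof(ms), sizeof(ms));
	sig_len = ntohl(ms.sig_len);
	if (sig_len + sizeof(ms) > *len)
		return -1;			/* bogus length field */
	*len -= sizeof(ms) + sig_len;		/* drop header and signature */
	return sig_len;
}

int main(void)
{
	unsigned char buf[128] = "MODULE";	/* 6 bytes of fake module */
	size_t len = 6;
	struct demo_module_signature ms = { .sig_len = htonl(4) };

	memcpy(buf + len, "SIGN", 4); len += 4;	/* fake signature bytes */
	memcpy(buf + len, &ms, sizeof(ms)); len += sizeof(ms);
	memcpy(buf + len, MODULE_SIG_STRING, sizeof(MODULE_SIG_STRING) - 1);
	len += sizeof(MODULE_SIG_STRING) - 1;

	/* prints "sig_len=4 payload=6" */
	printf("sig_len=%ld payload=%zu\n", demo_strip_signature(buf, &len), len);
	return 0;
}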
// SPDX-License-Identifier: GPL-2.0-only /* By Ross Biro 1/23/92 */ /* * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> #include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/processor.h> #include
<asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> #include <asm/io_bitmap.h> #include "tls.h" enum x86_regset_32 { REGSET32_GENERAL, REGSET32_FP, REGSET32_XFP, REGSET32_XSTATE, REGSET32_TLS, REGSET32_IOPERM, }; enum x86_regset_64 { REGSET64_GENERAL, REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, REGSET64_SSP, }; #define REGSET_GENERAL \ ({ \ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ REGSET32_GENERAL; \ }) #define REGSET_FP \ ({ \ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ REGSET32_FP; \ }) struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} static const struct pt_regs_offset regoffset_table[] = { #ifdef CONFIG_X86_64 REG_OFFSET_NAME(r15), REG_OFFSET_NAME(r14), REG_OFFSET_NAME(r13), REG_OFFSET_NAME(r12), REG_OFFSET_NAME(r11), REG_OFFSET_NAME(r10), REG_OFFSET_NAME(r9), REG_OFFSET_NAME(r8), #endif REG_OFFSET_NAME(bx), REG_OFFSET_NAME(cx), REG_OFFSET_NAME(dx), REG_OFFSET_NAME(si), REG_OFFSET_NAME(di), REG_OFFSET_NAME(bp), REG_OFFSET_NAME(ax), #ifdef CONFIG_X86_32 REG_OFFSET_NAME(ds), REG_OFFSET_NAME(es), REG_OFFSET_NAME(fs), REG_OFFSET_NAME(gs), #endif REG_OFFSET_NAME(orig_ax), REG_OFFSET_NAME(ip), REG_OFFSET_NAME(cs), REG_OFFSET_NAME(flags), REG_OFFSET_NAME(sp), REG_OFFSET_NAME(ss), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_query_register_name() - query register name from its offset * @offset: the offset of a register in struct pt_regs. * * regs_query_register_name() returns the name of a register from its * offset in struct pt_regs. If the @offset is invalid, this returns NULL; */ const char *regs_query_register_name(unsigned int offset) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (roff->offset == offset) return roff->name; return NULL; } /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ /* * Determines which flags the user has access to [1 = access, 0 = no access]. */ #define FLAG_MASK_32 ((unsigned long) \ (X86_EFLAGS_CF | X86_EFLAGS_PF | \ X86_EFLAGS_AF | X86_EFLAGS_ZF | \ X86_EFLAGS_SF | X86_EFLAGS_TF | \ X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) /* * Determines whether a value may be installed in a segment register. */ static inline bool invalid_selector(u16 value) { return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); } #ifdef CONFIG_X86_32 #define FLAG_MASK FLAG_MASK_32 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); return &regs->bx + (regno >> 2); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. 
*/ unsigned int retval; if (offset != offsetof(struct user_regs_struct, gs)) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) savesegment(gs, retval); else retval = task->thread.gs; } return retval; } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * For %cs and %ss we cannot permit a null selector. * We can permit a bogus selector as long as it has USER_RPL. * Null selectors are fine for other segment registers, but * we will never get back to user mode with invalid %cs or %ss * and will take the trap in iret instead. Much code relies * on user_mode() to distinguish a user trap frame (which can * safely use invalid selectors) from a kernel trap frame. */ switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; break; case offsetof(struct user_regs_struct, gs): task->thread.gs = value; } return 0; } #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) { BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); return &regs->r15 + (offset / sizeof(regs->r15)); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int seg; switch (offset) { case offsetof(struct user_regs_struct, fs): if (task == current) { /* Older gas can't assemble movq %?s,%r?? */ asm("movl %%fs,%0" : "=r" (seg)); return seg; } return task->thread.fsindex; case offsetof(struct user_regs_struct, gs): if (task == current) { asm("movl %%gs,%0" : "=r" (seg)); return seg; } return task->thread.gsindex; case offsetof(struct user_regs_struct, ds): if (task == current) { asm("movl %%ds,%0" : "=r" (seg)); return seg; } return task->thread.ds; case offsetof(struct user_regs_struct, es): if (task == current) { asm("movl %%es,%0" : "=r" (seg)); return seg; } return task->thread.es; case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): break; } return *pt_regs_access(task_pt_regs(task), offset); } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * Writes to FS and GS will change the stored selector. Whether * this changes the segment base as well depends on whether * FSGSBASE is enabled. */ switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; break; case offsetof(struct user_regs_struct,es): task->thread.es = value; break; /* * Can't actually change these in 64-bit mode. 
*/ case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->ss = value; break; } return 0; } #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) { unsigned long retval = task_pt_regs(task)->flags; /* * If the debugger set TF, hide it from the readout. */ if (test_tsk_thread_flag(task, TIF_FORCED_TF)) retval &= ~X86_EFLAGS_TF; return retval; } static int set_flags(struct task_struct *task, unsigned long value) { struct pt_regs *regs = task_pt_regs(task); /* * If the user value contains TF, mark that * it was not "us" (the debugger) that set it. * If not, make sure it stays set if we had. */ if (value & X86_EFLAGS_TF) clear_tsk_thread_flag(task, TIF_FORCED_TF); else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); return 0; } static int putreg(struct task_struct *child, unsigned long offset, unsigned long value) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return set_segment_reg(child, offset, value); case offsetof(struct user_regs_struct, flags): return set_flags(child, value); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_gsbase_write_task(child, value); return 0; #endif } *pt_regs_access(task_pt_regs(child), offset) = value; return 0; } static unsigned long getreg(struct task_struct *task, unsigned long offset) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return get_segment_reg(task, offset); case offsetof(struct user_regs_struct, flags): return get_flags(task); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): return x86_fsbase_read_task(task); case offsetof(struct user_regs_struct, gs_base): return x86_gsbase_read_task(task); #endif } return *pt_regs_access(task_pt_regs(task), offset); } static int genregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } static int genregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const unsigned long *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) break; ret = putreg(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { int i; struct thread_struct *thread = 
&(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ for (i = 0; i < HBP_NUM; i++) { if (thread->ptrace_bps[i] == bp) break; } thread->virtual_dr6 |= (DR_TRAP0 << i); } /* * Walk through every ptrace breakpoint for this thread and * build the dr7 value on top of their attributes. */ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { int i; int dr7 = 0; struct arch_hw_breakpoint *info; for (i = 0; i < HBP_NUM; i++) { if (bp[i] && !bp[i]->attr.disabled) { info = counter_arch_bp(bp[i]); dr7 |= encode_dr7(i, info->len, info->type); } } return dr7; } static int ptrace_fill_bp_fields(struct perf_event_attr *attr, int len, int type, bool disabled) { int err, bp_len, bp_type; err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); if (!err) { attr->bp_len = bp_len; attr->bp_type = bp_type; attr->disabled = disabled; } return err; } static struct perf_event * ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, unsigned long addr, bool disabled) { struct perf_event_attr attr; int err; ptrace_breakpoint_init(&attr); attr.bp_addr = addr; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return ERR_PTR(err); return register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, tsk); } static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int disabled) { struct perf_event_attr attr = bp->attr; int err; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; bool second_pass = false; int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; if (!bp) { if (disabled) continue; bp = ptrace_register_breakpoint(tsk, len, type, 0, disabled); if (IS_ERR(bp)) { rc = PTR_ERR(bp); break; } thread->ptrace_bps[i] = bp; continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } /* Restore if the first pass failed, second_pass shouldn't fail. */ if (rc && !WARN_ON(second_pass)) { ret = rc; data = old_dr7; second_pass = true; goto restore; } return ret; } /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { int index = array_index_nospec(n, HBP_NUM); struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; } else if (n == 6) { val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } return val; } static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { struct thread_struct *t = &tsk->thread; struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; if (!bp) { /* * Put stub len and type to create an inactive but correct bp. * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case.
* -EINVAL may be what we want for in-kernel breakpoint users, * but -EIO looks better for ptrace, since we refuse a register * write for the user. And anyway this is the previous * behaviour. */ bp = ptrace_register_breakpoint(tsk, X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, addr, true); if (IS_ERR(bp)) err = PTR_ERR(bp); else t->ptrace_bps[nr] = bp; } else { struct perf_event_attr attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } return err; } /* * Handle PTRACE_POKEUSR calls for the debug register area. */ static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ int rc = -EIO; if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } return rc; } /* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { struct io_bitmap *iobm = target->thread.io_bitmap; return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* * Called by kernel/ptrace.c when detaching. * * Make sure the single step bit is not set. */ void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif #ifdef CONFIG_X86_64 static const struct user_regset_view user_x86_64_view; /* Initialized below. */ #endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; #ifdef CONFIG_X86_64 /* This is native 64-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_64_view; #else /* This is native 32-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_32_view; #endif switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { unsigned long tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, datap); break; } case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child.
*/ return copy_regset_to_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, (struct user_desc __user *)data, 0); break; #endif #ifdef CONFIG_X86_64 /* normal 64bit interface to access TLS data. Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: ret = do_arch_prctl_64(child, data, addr); break; #endif default: ret = ptrace_request(child, request, addr, data); break; } return ret; } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> #include <linux/syscalls.h> #include <asm/ia32.h> #include <asm/user32.h> #define R32(l,q) \ case offsetof(struct user32, regs.l): \ regs->q = value; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ return set_segment_reg(child, \ offsetof(struct user_regs_struct, rs), \ value); \ break static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); /* * A 32-bit ptracer on a 64-bit kernel expects that writing * FS or GS will also update the base. This is needed for * operations like PTRACE_SETREGS to fully restore a saved * CPU state. */ case offsetof(struct user32, regs.fs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, fs), value); if (ret == 0) child->thread.fsbase = x86_fsgsbase_read_task(child, value); return ret; case offsetof(struct user32, regs.gs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, gs), value); if (ret == 0) child->thread.gsbase = x86_fsgsbase_read_task(child, value); return ret; SEG32(ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.orig_eax): /* * Warning: bizarre corner case fixup here. A 32-bit * debugger setting orig_eax to -1 wants to disable * syscall restart. Make sure that the syscall * restart code sign-extends orig_ax. Also make sure * we interpret the -ERESTART* codes correctly if * loaded into regs->ax in case the task is not * actually still sitting at the exit from a 32-bit * syscall with TS_COMPAT still set. 
*/ regs->orig_ax = value; if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): return set_flags(child, value); case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); return ptrace_set_debugreg(child, regno / 4, value); default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ break; } return 0; } #undef R32 #undef SEG32 #define R32(l,q) \ case offsetof(struct user32, regs.l): \ *val = regs->q; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ *val = get_segment_reg(child, \ offsetof(struct user_regs_struct, rs)); \ break static int getreg32(struct task_struct *child, unsigned regno, u32 *val) { struct pt_regs *regs = task_pt_regs(child); switch (regno) { SEG32(ds); SEG32(es); SEG32(fs); SEG32(gs); R32(cs, cs); R32(ss, ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.eflags): *val = get_flags(child); break; case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); *val = ptrace_get_debugreg(child, regno / 4); break; default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ *val = 0; break; } return 0; } #undef R32 #undef SEG32 static int genregs32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) { u32 val; getreg32(target, reg * 4, &val); membuf_store(&to, val); } return 0; } static int genregs32_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) break; ret = putreg32(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; __u32 val; switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); if (ret == 0) ret = put_user(val, (__u32 __user *)datap); break; case PTRACE_POKEUSR: ret = putreg32(child, addr, data); break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. 
*/ return copy_regset_from_user( child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: return arch_ptrace(child, request, addr, data); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif /* CONFIG_IA32_EMULATION */ #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { /* Read 32bits at location addr in the USER area. Only allow to return the lower 32bits of segment and debug registers. */ case PTRACE_PEEKUSR: { u32 tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, (__u32 __user *)datap); break; } /* Write the word at location addr in the USER area. Only allow to update segment and debug registers with the upper 32bits zero-extended. */ case PTRACE_POKEUSR: ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. 
*/ return copy_regset_from_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif #ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION return ia32_arch_ptrace(child, request, caddr, cdata); #else return 0; #endif } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET64_GENERAL] = { USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .regset_get = genregs_get, .set = genregs_set }, [REGSET64_FP] = { USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET64_XSTATE] = { USER_REGSET_NOTE_TYPE(X86_XSTATE), .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET64_IOPERM] = { USER_REGSET_NOTE_TYPE(386_IOPERM), .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), .active = ioperm_active, .regset_get = ioperm_get }, #ifdef CONFIG_X86_USER_SHADOW_STACK [REGSET64_SSP] = { USER_REGSET_NOTE_TYPE(X86_SHSTK), .n = 1, .size = sizeof(u64), .align = sizeof(u64), .active = ssp_active, .regset_get = ssp_get, .set = ssp_set }, #endif }; static const struct user_regset_view user_x86_64_view = { .name = "x86_64", .e_machine = EM_X86_64, .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) }; #else /* CONFIG_X86_32 */ #define user_regs_struct32 user_regs_struct #define genregs32_get genregs_get #define genregs32_set genregs_set #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET32_GENERAL] = { USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = genregs32_get, .set = genregs32_set }, [REGSET32_FP] = { USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET32_XFP] = { USER_REGSET_NOTE_TYPE(PRXFPREG), .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET32_XSTATE] = { USER_REGSET_NOTE_TYPE(X86_XSTATE), .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET32_TLS] = { USER_REGSET_NOTE_TYPE(386_TLS), .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET32_IOPERM] = { USER_REGSET_NOTE_TYPE(386_IOPERM), .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = ioperm_active, .regset_get = ioperm_get }, }; static const struct user_regset_view user_x86_32_view = { .name = "i386", .e_machine = EM_386, .regsets = 
x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) }; #endif /* * This represents bytes 464..511 in the memory layout exported through * the REGSET_XSTATE interface. */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } /* * This is used by the core dump code to decide which regset to dump. The * core dump code writes out the resulting .e_machine and the corresponding * regsets. This is suboptimal if the task is messing around with its CS.L * field, but at worst the core dump will end up missing some information. * * Unfortunately, it is also used by the broken PTRACE_GETREGSET and * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have * no way to make sure that the e_machine they use matches the caller's * expectations. The result is that the data format returned by * PTRACE_GETREGSET depends on the returned CS field (and even the offset * of the returned CS field depends on its value!) and the data format * accepted by PTRACE_SETREGSET is determined by the old CS value. The * upshot is that it is basically impossible to use these APIs correctly. * * The best way to fix it in the long run would probably be to add new * improved ptrace() APIs to read and write registers reliably, possibly by * allowing userspace to select the ELF e_machine variant that they expect. */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; #endif #ifdef CONFIG_X86_64 return &user_x86_64_view; #endif } void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { send_sigtrap(regs, 0, TRAP_BRKPT); }
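
The regset machinery above is what userspace reaches through PTRACE_GETREGSET and PTRACE_SETREGSET. As an illustration of the explicit-iovec calling convention (and of the mode-dependent data format that the comment before task_user_regset_view() warns about), here is a minimal userspace sketch; it is not part of this file, and the helper name read_gp_regs is hypothetical.

/*
 * Userspace sketch (not kernel code): read a stopped tracee's
 * general-purpose regset. NT_PRSTATUS selects the REGSET_GENERAL
 * regset served by genregs_get() above.
 */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <elf.h>

static int read_gp_regs(pid_t pid, struct user_regs_struct *regs)
{
	struct iovec iov = {
		.iov_base = regs,
		.iov_len = sizeof(*regs),
	};

	if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) == -1)
		return -1;

	/*
	 * The kernel shrinks iov.iov_len to the size of the regset it
	 * actually filled, which depends on the tracee's current mode;
	 * this is the ambiguity described above task_user_regset_view().
	 */
	return 0;
}
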
/* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_PROPERTY_H__ #define __DRM_PROPERTY_H__ #include <linux/list.h> #include <linux/ctype.h> #include <drm/drm_mode_object.h> #include <uapi/drm/drm_mode.h> /** * struct drm_property_enum - symbolic values for enumerations * @head: list of enum values, linked to &drm_property.enum_list * @name: symbolic name for the enum * * For enumeration and bitmask properties this structure stores the symbolic * decoding for each value. This is used for example for the rotation property. */ struct drm_property_enum { /** * @value: numeric property value for this enum entry * * If the property has the type &DRM_MODE_PROP_BITMASK, @value stores a * bitshift, not a bitmask. In other words, the enum entry is enabled * if the bit number @value is set in the property's value. This enum * entry has the bitmask ``1 << value``. */ uint64_t value; struct list_head head; char name[DRM_PROP_NAME_LEN]; }; /** * struct drm_property - modeset object property * * This structure represents a modeset object property. It combines both the name * of the property with the set of permissible values.
This means that when a * driver wants to use a property with the same name on different objects, but * with different value ranges, then it must create a property for each one. An * example would be rotation of &drm_plane, when e.g. the primary plane cannot * be rotated. But if both the name and the value range match, then the same * property structure can be instantiated multiple times for the same object. * Userspace must be able to cope with this and cannot assume that the same * symbolic property will have the same modeset object ID on all modeset * objects. * * Properties are created by one of the special functions, as explained in * detail in the @flags structure member. * * To actually expose a property it must be attached to each object using * drm_object_attach_property(). Currently properties can only be attached to * &drm_connector, &drm_crtc and &drm_plane. * * Properties are also used as the generic metadata transport for the atomic * IOCTL. Everything that was set directly in structures in the legacy modeset * IOCTLs (like the plane source or destination windows, or e.g. the links to * the CRTC) is exposed as a property with the DRM_MODE_PROP_ATOMIC flag set. */ struct drm_property { /** * @head: per-device list of properties, for cleanup. */ struct list_head head; /** * @base: base KMS object */ struct drm_mode_object base; /** * @flags: * * Property flags and type. A property needs to be one of the following * types: * * DRM_MODE_PROP_RANGE * Range properties report their minimum and maximum admissible unsigned values. * The KMS core verifies that values set by application fit in that * range. The range is unsigned. Range properties are created using * drm_property_create_range(). * * DRM_MODE_PROP_SIGNED_RANGE * Range properties report their minimum and maximum admissible signed values. * The KMS core verifies that values set by application fit in that * range. The range is signed. Range properties are created using * drm_property_create_signed_range(). * * DRM_MODE_PROP_ENUM * Enumerated properties take a numerical value that ranges from 0 to * the number of enumerated values defined by the property minus one, * and associate a free-formed string name to each value. Applications * can retrieve the list of defined value-name pairs and use the * numerical value to get and set property instance values. Enum * properties are created using drm_property_create_enum(). * * DRM_MODE_PROP_BITMASK * Bitmask properties are enumeration properties that additionally * restrict all enumerated values to the 0..63 range. Bitmask property * instance values combine one or more of the enumerated bits defined * by the property. Bitmask properties are created using * drm_property_create_bitmask(). * * DRM_MODE_PROP_OBJECT * Object properties are used to link modeset objects. This is used * extensively in the atomic support to create the display pipeline, * by linking &drm_framebuffer to &drm_plane, &drm_plane to * &drm_crtc and &drm_connector to &drm_crtc. An object property can * only link to a specific type of &drm_mode_object; this limit is * enforced by the core. Object properties are created using * drm_property_create_object(). * * Object properties work like blob properties, but in a more * general fashion. They are limited to atomic drivers and must have * the DRM_MODE_PROP_ATOMIC flag set. * * DRM_MODE_PROP_BLOB * Blob properties store a binary blob without any format restriction.
* The binary blobs are created as KMS standalone objects, and blob * property instance values store the ID of their associated blob * object. Blob properties are created by calling * drm_property_create() with DRM_MODE_PROP_BLOB as the type. * * Actual blob objects to contain blob data are created using * drm_property_create_blob(), or through the corresponding IOCTL. * * Besides the built-in limit to only accept blob objects, blob * properties work exactly like object properties. The only reason * blob properties exist is backwards compatibility with existing * userspace. * * In addition a property can have any combination of the below flags: * * DRM_MODE_PROP_ATOMIC * Set for properties which encode atomic modeset state. Such * properties are not exposed to legacy userspace. * * DRM_MODE_PROP_IMMUTABLE * Set for properties whose values cannot be changed by * userspace. The kernel is allowed to update the value of these * properties. This is generally used to expose probe state to * userspace, e.g. the EDID, or the connector path property on DP * MST sinks. The kernel can update the value of an immutable property * by calling drm_object_property_set_value(). */ uint32_t flags; /** * @name: symbolic name of the property */ char name[DRM_PROP_NAME_LEN]; /** * @num_values: size of the @values array. */ uint32_t num_values; /** * @values: * * Array with limits and values for the property. The * interpretation of these limits is dependent upon the type per @flags. */ uint64_t *values; /** * @dev: DRM device */ struct drm_device *dev; /** * @enum_list: * * List of &drm_prop_enum_list structures with the symbolic names for * enum and bitmask values. */ struct list_head enum_list; }; /** * struct drm_property_blob - Blob data for &drm_property * @base: base KMS object * @dev: DRM device * @head_global: entry on the global blob list in * &drm_mode_config.property_blob_list. * @head_file: entry on the per-file blob list in &drm_file.blobs list. * @length: size of the blob in bytes, invariant over the lifetime of the object * @data: actual data, embedded at the end of this structure * * Blobs are used to store bigger values than what fits directly into the 64 * bits available for a &drm_property. * * Blobs are reference counted using drm_property_blob_get() and * drm_property_blob_put(). They are created using drm_property_create_blob(). */ struct drm_property_blob { struct drm_mode_object base; struct drm_device *dev; struct list_head head_global; struct list_head head_file; size_t length; void *data; }; struct drm_prop_enum_list { int type; const char *name; }; #define obj_to_property(x) container_of(x, struct drm_property, base) #define obj_to_blob(x) container_of(x, struct drm_property_blob, base) /** * drm_property_type_is - check the type of a property * @property: property to check * @type: property type to compare with * * This is a helper function because the uapi encoding of property types is * a bit special for historical reasons. */ static inline bool drm_property_type_is(struct drm_property *property, uint32_t type) { /* instanceof for props..
handles extended type vs original types: */ if (property->flags & DRM_MODE_PROP_EXTENDED_TYPE) return (property->flags & DRM_MODE_PROP_EXTENDED_TYPE) == type; return property->flags & type; } struct drm_property *drm_property_create(struct drm_device *dev, u32 flags, const char *name, int num_values); struct drm_property *drm_property_create_enum(struct drm_device *dev, u32 flags, const char *name, const struct drm_prop_enum_list *props, int num_values); struct drm_property *drm_property_create_bitmask(struct drm_device *dev, u32 flags, const char *name, const struct drm_prop_enum_list *props, int num_props, uint64_t supported_bits); struct drm_property *drm_property_create_range(struct drm_device *dev, u32 flags, const char *name, uint64_t min, uint64_t max); struct drm_property *drm_property_create_signed_range(struct drm_device *dev, u32 flags, const char *name, int64_t min, int64_t max); struct drm_property *drm_property_create_object(struct drm_device *dev, u32 flags, const char *name, uint32_t type); struct drm_property *drm_property_create_bool(struct drm_device *dev, u32 flags, const char *name); int drm_property_add_enum(struct drm_property *property, uint64_t value, const char *name); void drm_property_destroy(struct drm_device *dev, struct drm_property *property); struct drm_property_blob *drm_property_create_blob(struct drm_device *dev, size_t length, const void *data); struct drm_property_blob *drm_property_lookup_blob(struct drm_device *dev, uint32_t id); int drm_property_replace_blob_from_id(struct drm_device *dev, struct drm_property_blob **blob, uint64_t blob_id, ssize_t expected_size, ssize_t expected_elem_size, bool *replaced); int drm_property_replace_global_blob(struct drm_device *dev, struct drm_property_blob **replace, size_t length, const void *data, struct drm_mode_object *obj_holds_id, struct drm_property *prop_holds_id); bool drm_property_replace_blob(struct drm_property_blob **blob, struct drm_property_blob *new_blob); struct drm_property_blob *drm_property_blob_get(struct drm_property_blob *blob); void drm_property_blob_put(struct drm_property_blob *blob); /** * drm_property_find - find property object * @dev: DRM device * @file_priv: drm file to check for lease against. * @id: property object id * * This function looks up the property object specified by id and returns it. */ static inline struct drm_property *drm_property_find(struct drm_device *dev, struct drm_file *file_priv, uint32_t id) { struct drm_mode_object *mo; mo = drm_mode_object_find(dev, file_priv, id, DRM_MODE_OBJECT_PROPERTY); return mo ? obj_to_property(mo) : NULL; } #endif
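
On the driver side, the create-then-attach flow described in the kdoc above looks roughly like the following sketch. The helper name example_create_alpha_prop and the property name "example-alpha" are hypothetical, and error handling is abbreviated.

#include <linux/errno.h>
#include <drm/drm_device.h>
#include <drm/drm_plane.h>
#include <drm/drm_property.h>

/* Hypothetical driver helper: expose a DRM_MODE_PROP_RANGE on a plane. */
static int example_create_alpha_prop(struct drm_plane *plane)
{
	struct drm_property *prop;

	/* The KMS core will reject values outside [0, 255]. */
	prop = drm_property_create_range(plane->dev, 0, "example-alpha", 0, 255);
	if (!prop)
		return -ENOMEM;

	/* Attach to the plane with an initial value of 255 (opaque). */
	drm_object_attach_property(&plane->base, prop, 255);
	return 0;
}
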
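For blob properties, the lifecycle documented above (standalone, reference-counted KMS objects referenced by ID) can be sketched as follows; example_blob_roundtrip is a hypothetical helper, not a real DRM entry point.

#include <linux/err.h>
#include <drm/drm_property.h>

/* Hypothetical sketch: create a blob, look its ID up, drop both references. */
static int example_blob_roundtrip(struct drm_device *dev,
				  const void *data, size_t len)
{
	struct drm_property_blob *blob, *found;
	uint32_t id;

	/* Allocates a standalone KMS object holding a copy of @data. */
	blob = drm_property_create_blob(dev, len, data);
	if (IS_ERR(blob))
		return PTR_ERR(blob);
	id = blob->base.id;

	/*
	 * A DRM_MODE_PROP_BLOB instance value would store @id; looking
	 * the ID up again takes a new reference that must be put.
	 */
	found = drm_property_lookup_blob(dev, id);
	if (found)
		drm_property_blob_put(found);

	drm_property_blob_put(blob);
	return 0;
}
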
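Finally, the drm_property_enum kdoc above notes that for DRM_MODE_PROP_BITMASK entries @value is a bit position rather than a mask. A small sketch of decoding an instance value under that rule; the helper and its pr_info() logging are purely illustrative.

#include <linux/bits.h>
#include <linux/printk.h>
#include <drm/drm_property.h>

/* Hypothetical decode loop: report which enum bits are set in @instance_value. */
static void example_print_bitmask(struct drm_property *prop,
				  uint64_t instance_value)
{
	struct drm_property_enum *entry;

	list_for_each_entry(entry, &prop->enum_list, head) {
		/* @value is a bitshift: the entry is enabled iff BIT_ULL(@value) is set. */
		if (instance_value & BIT_ULL(entry->value))
			pr_info("%s\n", entry->name);
	}
}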
3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 
4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
* Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> #include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 /* Pretend that one inode + its dentry occupy this much memory */ #define BOGO_INODE_SIZE 1024 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. 
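 *
 * A minimal userspace sketch of the path this structure serves
 * (illustrative only, not part of this file; "/dev/shm/f" is just an
 * assumed tmpfs location):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/dev/shm/f", O_RDWR | O_CREAT, 0600);
 *
 *		ftruncate(fd, 1 << 20);
 *		// Preallocation: shmem_fallocate() publishes progress
 *		// through inode->i_private while this call runs.
 *		fallocate(fd, 0, 0, 1 << 20);
 *		// Hole punch: concurrent faults into the range wait on
 *		// shmem_falloc->waitq until the punch has finished.
 *		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *			  0, 1 << 19);
 *		close(fd);
 *		return 0;
 *	}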
*/ struct shmem_falloc { wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { unsigned long long blocks; unsigned long long inodes; struct mempolicy *mpol; kuid_t uid; kgid_t gid; umode_t mode; bool full_inums; int huge; int seen; bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *encoding; bool strict_encoding; #endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); return min3(nr_pages - totalhigh_pages(), nr_pages / 2, ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; } /* * shmem_file_setup pre-accounts the whole fixed size of a VM object, * for shared memory and for shared anonymous (/dev/zero) mappings * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), * consistent with the pre-accounting of private mappings ... */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { if (!(flags & VM_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); } return 0; } /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
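 *
 * Illustrative consequence of the incremental scheme (a sketch, not
 * from this file; assumes a tmpfs mounted at /mnt/tiny with size=1m,
 * plus <fcntl.h>, <sys/mman.h> and <unistd.h>):
 *
 *	int fd = open("/mnt/tiny/big", O_RDWR | O_CREAT, 0600);
 *	char *p;
 *
 *	ftruncate(fd, 16 << 20);	// sparse: no blocks accounted yet
 *	p = mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE,
 *		 MAP_SHARED, fd, 0);
 *	p[0] = 1;		// first block accounted, within the limit
 *	p[8 << 20] = 1;		// over the limit: shmem_acct_blocks()
 *				// fails, the fault sees -ENOSPC: SIGBUS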
*/ static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks, sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); if (err) { percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) goto unacct; } return 0; unacct: shmem_unacct_blocks(info->flags, pages); return err; } static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA static int shmem_enable_quotas(struct super_block *sb, unsigned short quota_types) { int type, err = 0; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < SHMEM_MAXQUOTAS; type++) { if (!(quota_types & (1 << type))) continue; err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. 
However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. 
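 *
 * Worked example (illustrative): alloced = 8, nrpages still 8, and
 * shmem_writeout() has just raised swapped from 0 to 1.  freed then
 * computes as 8 - 1 - 8 = -1; the "freed += swapped" below restores
 * freed = 0, matching reality: the folio is queued for swap but has
 * not yet left the page cache.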
*/ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. 
* Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. 
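 *
 * Illustrative mount-time selection (a sketch; the mountpoint is
 * assumed, the option string is as parsed by shmem_parse_huge()
 * below, and <sys/mount.h> is needed):
 *
 *	// Huge folios only when they would sit fully under i_size:
 *	mount("tmpfs", "/mnt/ws", "tmpfs", 0, "huge=within_size");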
*/ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order; return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: if (vma) within_size_orders = maybe_pmd_order; else within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return maybe_pmd_order; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. 
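 *
 * One plausible hazard (an inference, not spelled out here): the lock
 * holder may itself be blocked waiting for reclaim to make progress,
 * and this shrinker runs on behalf of reclaim; sleeping in
 * folio_lock() could then stall both sides, so the folio is skipped
 * and the inode re-queued instead.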
*/ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 
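 *
 * Illustrative relationship between the two representations (the
 * helpers are from <linux/swapops.h>; sketch only):
 *
 *	swp_entry_t swap = folio->swap;
 *	void *radswap = swp_to_radix_entry(swap);   // XArray value entry
 *
 *	// A later lookup sees xa_is_value(radswap) as true and recovers
 *	// the swap entry with radix_to_swp_entry(radswap).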
*/ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return 0; free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); return 1 << order; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. 
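 *
 * Illustrative case: punching bytes 3000..10000 in a file whose
 * i_size is 4096 still needs the folio at index 0 for partial
 * zeroing, but the probe for the far end of the range (index 2) lies
 * beyond i_size, where SGP_READ would fail - hence the bare
 * page-cache lookup is tried first.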
*/ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. 
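 *
 * Worked example (illustrative): a fallocate() of 0..4MB fails after
 * 3MB; the undo pass arrives here with unfalloc=true and must drop,
 * whole, every folio the fallocate added (all still !uptodate), while
 * leaving any pre-existing uptodate folios in the range untouched -
 * which is why the partial-folio zeroing and splitting below are
 * skipped.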
*/ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_pos(folio) + folio_size(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], folio); if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. 
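 *
 * Worked example (illustrative): a PMD-sized folio spans indices
 * 512..1023 and the hole covers 600..1023.  After a successful split
 * the folio seen here is order 0, !folio_test_large() holds, and the
 * loop restarts from 'start' so that the new subfolios inside the
 * hole are truncated individually on the next pass.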
*/ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { 
shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. 
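 *
 * Illustrative trigger (a sketch; the device path is assumed, and
 * <sys/swap.h> declares swapoff(2)):
 *
 *	// swapoff() reaches shmem_unuse() via try_to_unuse() for every
 *	// tmpfs page that was pushed out to the departing device:
 *	swapoff("/dev/sdb2");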
*/ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. 
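 *
 * Worked timeline (illustrative): fallocate() preallocates indices
 * 0..255 but userspace never writes them; reclaim then sends index
 * 100 to shmem_writeout().  If the fallocate has already returned,
 * the folio is zero-filled, marked uptodate and swapped like any
 * other; if it is still in flight, nr_unswapped is bumped and the
 * folio reactivated so shmem_fallocate() can notice the pressure and
 * bail out early.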
*/ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); swap_free_nr(folio->swap, nr_pages); } /* * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. 
*/ swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. 
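
/*
 * Editor's illustration (not part of shmem.c): both early "goto redirty"
 * exits above are reachable from userspace.  A minimal sketch of the
 * sbinfo->noswap path, assuming a kernel with the tmpfs "noswap" mount
 * option (added around v6.4) and CAP_SYS_ADMIN; the mount point
 * /mnt/noswap is hypothetical:
 */
#if 0	/* standalone userspace sketch, not kernel code */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* Folios of files on this mount reach shmem_writeout() under
     * memory pressure but are always redirtied, never swapped. */
    if (mount("tmpfs", "/mnt/noswap", "tmpfs", 0, "noswap") != 0) {
        perror("mount");
        return 1;
    }
    return 0;
}
#endif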
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
    char buffer[64];

    if (!mpol || mpol->mode == MPOL_DEFAULT)
        return;        /* show nothing */

    mpol_to_str(buffer, sizeof(buffer), mpol);
    seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
    struct mempolicy *mpol = NULL;

    if (sbinfo->mpol) {
        raw_spin_lock(&sbinfo->stat_lock);    /* prevent replace/use races */
        mpol = sbinfo->mpol;
        mpol_get(mpol);
        raw_spin_unlock(&sbinfo->stat_lock);
    }
    return mpol;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
    return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */

static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
            pgoff_t index, unsigned int order, pgoff_t *ilx);

static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
            struct shmem_inode_info *info, pgoff_t index)
{
    struct mempolicy *mpol;
    pgoff_t ilx;
    struct folio *folio;

    mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
    folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
    mpol_cond_put(mpol);

    return folio;
}

/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
    gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
    gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
    gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
    gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

    /* Allow allocations only from the originally specified zones. */
    result |= zoneflags;

    /*
     * Minimize the result gfp by taking the union with the deny flags,
     * and the intersection of the allow flags.
     */
    result |= (limit_gfp & denyflags);
    result |= (huge_gfp & limit_gfp) & allowflags;

    return result;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool shmem_hpage_pmd_enabled(void)
{
    if (shmem_huge == SHMEM_HUGE_DENY)
        return false;
    if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
        return true;
    if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
        return true;
    if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
        return true;
    if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
        shmem_huge != SHMEM_HUGE_NEVER)
        return true;

    return false;
}

unsigned long shmem_allowable_huge_orders(struct inode *inode,
                struct vm_area_struct *vma, pgoff_t index,
                loff_t write_end, bool shmem_huge_force)
{
    unsigned long mask = READ_ONCE(huge_shmem_orders_always);
    unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
    vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
    unsigned int global_orders;

    if (thp_disabled_by_hw() ||
        (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
        return 0;

    global_orders = shmem_huge_global_enabled(inode, index, write_end,
                        shmem_huge_force, vma, vm_flags);
    /* Tmpfs huge pages allocation */
    if (!vma || !vma_is_anon_shmem(vma))
        return global_orders;

    /*
     * Following the 'deny' semantics of the top level, force the huge
     * option off from all mounts.
     */
    if (shmem_huge == SHMEM_HUGE_DENY)
        return 0;

    /*
     * Only allow inherit orders if the top-level value is 'force', which
     * means non-PMD sized THP cannot override the 'huge' mount option now.
     */
    if (shmem_huge == SHMEM_HUGE_FORCE)
        return READ_ONCE(huge_shmem_orders_inherit);

    /* Allow mTHP that will be fully within i_size. */
    mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);

    if (vm_flags & VM_HUGEPAGE)
        mask |= READ_ONCE(huge_shmem_orders_madvise);

    if (global_orders > 0)
        mask |= READ_ONCE(huge_shmem_orders_inherit);

    return THP_ORDERS_ALL_FILE_DEFAULT & mask;
}
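
/*
 * Editor's illustration (not part of shmem.c): the huge_shmem_orders_madvise
 * branch above is driven by madvise(MADV_HUGEPAGE) on a MAP_SHARED shmem
 * mapping, which sets VM_HUGEPAGE on the vma.  A minimal userspace sketch,
 * assuming glibc 2.27+ for memfd_create() and a THP-enabled kernel:
 */
#if 0	/* standalone userspace sketch */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    size_t len = 4UL << 20;                 /* 4 MiB */
    int fd = memfd_create("thp-demo", 0);   /* tmpfs-backed */

    ftruncate(fd, len);
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    /* VM_HUGEPAGE set: shmem_allowable_huge_orders() may now OR in the
     * huge_shmem_orders_madvise orders for faults on this range. */
    madvise(p, len, MADV_HUGEPAGE);
    p[0] = 1;                               /* fault: may get a large folio */
    munmap(p, len);
    close(fd);
    return 0;
}
#endif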
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
                    struct address_space *mapping, pgoff_t index,
                    unsigned long orders)
{
    struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
    pgoff_t aligned_index;
    unsigned long pages;
    int order;

    if (vma) {
        orders = thp_vma_suitable_orders(vma, vmf->address, orders);
        if (!orders)
            return 0;
    }

    /* Find the highest order that can add into the page cache */
    order = highest_order(orders);
    while (orders) {
        pages = 1UL << order;
        aligned_index = round_down(index, pages);
        /*
         * Check for conflict before waiting on a huge allocation.
         * Conflict might be that a huge page has just been allocated
         * and added to page cache by a racing thread, or that there
         * is already at least one small page in the huge extent.
         * Be careful to retry when appropriate, but not forever!
         * Elsewhere -EEXIST would be the right code, but not here.
         */
        if (!xa_find(&mapping->i_pages, &aligned_index,
                aligned_index + pages - 1, XA_PRESENT))
            break;
        order = next_order(&orders, order);
    }

    return orders;
}
#else
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
                    struct address_space *mapping, pgoff_t index,
                    unsigned long orders)
{
    return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
        struct shmem_inode_info *info, pgoff_t index)
{
    struct mempolicy *mpol;
    pgoff_t ilx;
    struct folio *folio;

    mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
    folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
    mpol_cond_put(mpol);

    return folio;
}

static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
        gfp_t gfp, struct inode *inode, pgoff_t index,
        struct mm_struct *fault_mm, unsigned long orders)
{
    struct address_space *mapping = inode->i_mapping;
    struct shmem_inode_info *info = SHMEM_I(inode);
    unsigned long suitable_orders = 0;
    struct folio *folio = NULL;
    pgoff_t aligned_index;
    long pages;
    int error, order;

    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
        orders = 0;

    if (orders > 0) {
        suitable_orders = shmem_suitable_orders(inode, vmf,
                            mapping, index, orders);

        order = highest_order(suitable_orders);
        while (suitable_orders) {
            pages = 1UL << order;
            aligned_index = round_down(index, pages);
            folio = shmem_alloc_folio(gfp, order, info, aligned_index);
            if (folio) {
                index = aligned_index;
                goto allocated;
            }

            if (pages == HPAGE_PMD_NR)
                count_vm_event(THP_FILE_FALLBACK);
            count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
            order = next_order(&suitable_orders, order);
        }
    } else {
        pages = 1;
        folio = shmem_alloc_folio(gfp, 0, info, index);
    }
    if (!folio)
        return ERR_PTR(-ENOMEM);

allocated:
    __folio_set_locked(folio);
    __folio_set_swapbacked(folio);

    gfp &= GFP_RECLAIM_MASK;
    error = mem_cgroup_charge(folio, fault_mm, gfp);
    if (error) {
        if (xa_find(&mapping->i_pages, &index,
                index + pages - 1, XA_PRESENT)) {
            error = -EEXIST;
        } else if (pages > 1) {
            if (pages == HPAGE_PMD_NR) {
                count_vm_event(THP_FILE_FALLBACK);
                count_vm_event(THP_FILE_FALLBACK_CHARGE);
            }
            count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
            count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
        }
        goto unlock;
    }

    error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
    if (error)
        goto unlock;

    error = shmem_inode_acct_blocks(inode, pages);
    if (error) {
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        long freed;
        /*
         * Try to reclaim some space by splitting a few
         * large folios beyond i_size on the filesystem.
         */
        shmem_unused_huge_shrink(sbinfo, NULL, pages);
        /*
         * And do a shmem_recalc_inode() to account for freed pages:
         * except our folio is there in cache, so not quite balanced.
         */
        spin_lock(&info->lock);
        freed = pages + info->alloced - info->swapped -
            READ_ONCE(mapping->nrpages);
        if (freed > 0)
            info->alloced -= freed;
        spin_unlock(&info->lock);
        if (freed > 0)
            shmem_inode_unacct_blocks(inode, freed);
        error = shmem_inode_acct_blocks(inode, pages);
        if (error) {
            filemap_remove_folio(folio);
            goto unlock;
        }
    }

    shmem_recalc_inode(inode, pages, 0);
    folio_add_lru(folio);
    return folio;

unlock:
    folio_unlock(folio);
    folio_put(folio);
    return ERR_PTR(error);
}
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
        struct vm_area_struct *vma, pgoff_t index,
        swp_entry_t entry, int order, gfp_t gfp)
{
    struct shmem_inode_info *info = SHMEM_I(inode);
    int nr_pages = 1 << order;
    struct folio *new;
    gfp_t alloc_gfp;
    void *shadow;

    /*
     * We have arrived here because our zones are constrained, so don't
     * limit chance of success with further cpuset and node constraints.
     */
    gfp &= ~GFP_CONSTRAINT_MASK;
    alloc_gfp = gfp;
    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
        if (WARN_ON_ONCE(order))
            return ERR_PTR(-EINVAL);
    } else if (order) {
        /*
         * If uffd is active for the vma, we need per-page fault
         * fidelity to maintain the uffd semantics, so fall back
         * to swapping in an order-0 folio; do the same for the
         * zswap case.  Any existing sub folio in the swap cache
         * also blocks mTHP swapin.
         */
        if ((vma && unlikely(userfaultfd_armed(vma))) ||
            !zswap_never_enabled() ||
            non_swapcache_batch(entry, nr_pages) != nr_pages)
            goto fallback;

        alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
    }
retry:
    new = shmem_alloc_folio(alloc_gfp, order, info, index);
    if (!new) {
        new = ERR_PTR(-ENOMEM);
        goto fallback;
    }

    if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
                    alloc_gfp, entry)) {
        folio_put(new);
        new = ERR_PTR(-ENOMEM);
        goto fallback;
    }

    /*
     * Prevent parallel swapin from proceeding with the swap cache flag.
     *
     * Of course there is another possible concurrent scenario as well,
     * that is to say, the swap cache flag of a large folio has already
     * been set by swapcache_prepare(), while another thread may have
     * already split the large swap entry stored in the shmem mapping.
     * In this case, shmem_add_to_page_cache() will help identify the
     * concurrent swapin and return -EEXIST.
     */
    if (swapcache_prepare(entry, nr_pages)) {
        folio_put(new);
        new = ERR_PTR(-EEXIST);
        /* Try smaller folio to avoid cache conflict */
        goto fallback;
    }

    __folio_set_locked(new);
    __folio_set_swapbacked(new);
    new->swap = entry;

    memcg1_swapin(entry, nr_pages);
    shadow = swap_cache_get_shadow(entry);
    if (shadow)
        workingset_refault(new, shadow);
    folio_add_lru(new);
    swap_read_folio(new, NULL);
    return new;
fallback:
    /* Order 0 swapin failed, nothing to fall back to, abort */
    if (!order)
        return new;
    entry.val += index - round_down(index, nr_pages);
    alloc_gfp = gfp;
    nr_pages = 1;
    order = 0;
    goto retry;
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
    return folio_zonenum(folio) > gfp_zone(gfp);
}

static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index,
                struct vm_area_struct *vma)
{
    struct swap_cluster_info *ci;
    struct folio *new, *old = *foliop;
    swp_entry_t entry = old->swap;
    int nr_pages = folio_nr_pages(old);
    int error = 0;

    /*
     * We have arrived here because our zones are constrained, so don't
     * limit chance of success by further cpuset and node constraints.
     */
    gfp &= ~GFP_CONSTRAINT_MASK;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    if (nr_pages > 1) {
        gfp_t huge_gfp = vma_thp_gfp_mask(vma);

        gfp = limit_gfp_mask(huge_gfp, gfp);
    }
#endif

    new = shmem_alloc_folio(gfp, folio_order(old), info, index);
    if (!new)
        return -ENOMEM;

    folio_ref_add(new, nr_pages);
    folio_copy(new, old);
    flush_dcache_folio(new);

    __folio_set_locked(new);
    __folio_set_swapbacked(new);
    folio_mark_uptodate(new);
    new->swap = entry;
    folio_set_swapcache(new);

    ci = swap_cluster_get_and_lock_irq(old);
    __swap_cache_replace_folio(ci, old, new);
    mem_cgroup_replace_folio(old, new);
    shmem_update_stats(new, nr_pages);
    shmem_update_stats(old, -nr_pages);
    swap_cluster_unlock_irq(ci);

    folio_add_lru(new);
    *foliop = new;

    folio_clear_swapcache(old);
    old->private = NULL;
    folio_unlock(old);

    /*
     * The old folio has been removed from the swap cache: drop the
     * 'nr_pages' references taken above, as well as the one temporary
     * reference we got from the swap cache.
     */
    folio_put_refs(old, nr_pages + 1);
    return error;
}

static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
                    struct folio *folio, swp_entry_t swap,
                    bool skip_swapcache)
{
    struct address_space *mapping = inode->i_mapping;
    swp_entry_t swapin_error;
    void *old;
    int nr_pages;

    swapin_error = make_poisoned_swp_entry();
    old = xa_cmpxchg_irq(&mapping->i_pages, index,
                swp_to_radix_entry(swap),
                swp_to_radix_entry(swapin_error), 0);
    if (old != swp_to_radix_entry(swap))
        return;

    nr_pages = folio_nr_pages(folio);
    folio_wait_writeback(folio);
    if (!skip_swapcache)
        swap_cache_del_folio(folio);
    /*
     * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
     * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
     * in shmem_evict_inode().
     */
    shmem_recalc_inode(inode, -nr_pages, -nr_pages);
    swap_free_nr(swap, nr_pages);
}

static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
                swp_entry_t swap, gfp_t gfp)
{
    struct address_space *mapping = inode->i_mapping;
    XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
    int split_order = 0;
    int i;

    /* Convert user data gfp flags to xarray node gfp flags */
    gfp &= GFP_RECLAIM_MASK;

    for (;;) {
        void *old = NULL;
        int cur_order;
        pgoff_t swap_index;

        xas_lock_irq(&xas);

        old = xas_load(&xas);
        if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
            xas_set_err(&xas, -EEXIST);
            goto unlock;
        }

        cur_order = xas_get_order(&xas);
        if (!cur_order)
            goto unlock;

        /* Try to split large swap entry in pagecache */
        swap_index = round_down(index, 1 << cur_order);
        split_order = xas_try_split_min_order(cur_order);

        while (cur_order > 0) {
            pgoff_t aligned_index =
                round_down(index, 1 << cur_order);
            pgoff_t swap_offset = aligned_index - swap_index;

            xas_set_order(&xas, index, split_order);
            xas_try_split(&xas, old, cur_order);
            if (xas_error(&xas))
                goto unlock;

            /*
             * Re-set the swap entry after splitting, and the swap
             * offset of the original large entry must be continuous.
             */
            for (i = 0; i < 1 << cur_order; i += (1 << split_order)) {
                swp_entry_t tmp;

                tmp = swp_entry(swp_type(swap),
                        swp_offset(swap) + swap_offset + i);
                __xa_store(&mapping->i_pages, aligned_index + i,
                        swp_to_radix_entry(tmp), 0);
            }
            cur_order = split_order;
            split_order = xas_try_split_min_order(split_order);
        }

unlock:
        xas_unlock_irq(&xas);

        if (!xas_nomem(&xas, gfp))
            break;
    }

    if (xas_error(&xas))
        return xas_error(&xas);

    return 0;
}

/*
 * Swap in the folio pointed to by *foliop.
 * Caller has to make sure that *foliop contains a valid swapped folio.
 * Returns 0 and the folio in *foliop on success; on failure, returns the
 * error code and leaves NULL in *foliop.
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                struct folio **foliop, enum sgp_type sgp,
                gfp_t gfp, struct vm_area_struct *vma,
                vm_fault_t *fault_type)
{
    struct address_space *mapping = inode->i_mapping;
    struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
    struct shmem_inode_info *info = SHMEM_I(inode);
    swp_entry_t swap, index_entry;
    struct swap_info_struct *si;
    struct folio *folio = NULL;
    bool skip_swapcache = false;
    int error, nr_pages, order;
    pgoff_t offset;

    VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
    index_entry = radix_to_swp_entry(*foliop);
    swap = index_entry;
    *foliop = NULL;

    if (is_poisoned_swp_entry(index_entry))
        return -EIO;

    si = get_swap_device(index_entry);
    order = shmem_confirm_swap(mapping, index, index_entry);
    if (unlikely(!si)) {
        if (order < 0)
            return -EEXIST;
        else
            return -EINVAL;
    }
    if (unlikely(order < 0)) {
        put_swap_device(si);
        return -EEXIST;
    }

    /* index may point to the middle of a large entry, get the sub entry */
    if (order) {
        offset = index - round_down(index, 1 << order);
        swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
    }

    /* Look it up and read it in.. */
    folio = swap_cache_get_folio(swap);
    if (!folio) {
        if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
            /* Direct swapin skipping swap cache & readahead */
            folio = shmem_swap_alloc_folio(inode, vma, index,
                            index_entry, order, gfp);
            if (IS_ERR(folio)) {
                error = PTR_ERR(folio);
                folio = NULL;
                goto failed;
            }
            skip_swapcache = true;
        } else {
            /* Cached swapin only supports order 0 folio */
            folio = shmem_swapin_cluster(swap, gfp, info, index);
            if (!folio) {
                error = -ENOMEM;
                goto failed;
            }
        }
        if (fault_type) {
            *fault_type |= VM_FAULT_MAJOR;
            count_vm_event(PGMAJFAULT);
            count_memcg_event_mm(fault_mm, PGMAJFAULT);
        }
    } else {
        swap_update_readahead(folio, NULL, 0);
    }

    if (order > folio_order(folio)) {
        /*
         * Swapin may get smaller folios due to various reasons:
         * it may fall back to order 0 due to memory pressure or race,
         * swap readahead may swap in order 0 folios into swapcache
         * asynchronously, while the shmem mapping can still store
         * large swap entries.  In such cases, we should split the
         * large swap entry to prevent possible data corruption.
         */
        error = shmem_split_large_entry(inode, index, index_entry, gfp);
        if (error)
            goto failed_nolock;
    }

    /*
     * If the folio is large, round down swap and index by folio size.
     * No matter what race occurs, the swap layer ensures we either get
     * a valid folio that has its swap entry aligned by size, or a
     * temporarily invalid one which we'll abort very soon and retry.
     *
     * shmem_add_to_page_cache() ensures the whole range contains expected
     * entries and prevents any corruption, so any race split is fine
     * too, it will succeed as long as the entries are still there.
     */
    nr_pages = folio_nr_pages(folio);
    if (nr_pages > 1) {
        swap.val = round_down(swap.val, nr_pages);
        index = round_down(index, nr_pages);
    }

    /*
     * We have to do this with the folio locked to prevent races.
     * The shmem_confirm_swap() below only checks if the first swap
     * entry matches the folio, that's enough to ensure the folio
     * is not used outside of shmem, as shmem swap entries
     * and swap cache folios are never partially freed.
     */
    folio_lock(folio);
    if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
        shmem_confirm_swap(mapping, index, swap) < 0 ||
        folio->swap.val != swap.val) {
        error = -EEXIST;
        goto unlock;
    }
    if (!folio_test_uptodate(folio)) {
        error = -EIO;
        goto failed;
    }
    folio_wait_writeback(folio);

    /*
     * Some architectures may have to restore extra metadata to the
     * folio after reading from swap.
     */
    arch_swap_restore(folio_swap(swap, folio), folio);

    if (shmem_should_replace_folio(folio, gfp)) {
        error = shmem_replace_folio(&folio, gfp, info, index, vma);
        if (error)
            goto failed;
    }

    error = shmem_add_to_page_cache(folio, mapping, index,
                    swp_to_radix_entry(swap), gfp);
    if (error)
        goto failed;

    shmem_recalc_inode(inode, 0, -nr_pages);

    if (sgp == SGP_WRITE)
        folio_mark_accessed(folio);

    if (skip_swapcache) {
        folio->swap.val = 0;
        swapcache_clear(si, swap, nr_pages);
    } else {
        swap_cache_del_folio(folio);
    }
    folio_mark_dirty(folio);
    swap_free_nr(swap, nr_pages);
    put_swap_device(si);

    *foliop = folio;
    return 0;
failed:
    if (shmem_confirm_swap(mapping, index, swap) < 0)
        error = -EEXIST;
    if (error == -EIO)
        shmem_set_folio_swapin_error(inode, index, folio, swap,
                        skip_swapcache);
unlock:
    if (folio)
        folio_unlock(folio);
failed_nolock:
    if (skip_swapcache)
        swapcache_clear(si, folio->swap, folio_nr_pages(folio));
    if (folio)
        folio_put(folio);
    put_swap_device(si);
    return error;
}
/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault: otherwise they
 * are NULL.
 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
        loff_t write_end, struct folio **foliop, enum sgp_type sgp,
        gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{
    struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
    struct mm_struct *fault_mm;
    struct folio *folio;
    int error;
    bool alloced;
    unsigned long orders = 0;

    if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
        return -EINVAL;

    if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
        return -EFBIG;
repeat:
    if (sgp <= SGP_CACHE &&
        ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
        return -EINVAL;

    alloced = false;
    fault_mm = vma ? vma->vm_mm : NULL;

    folio = filemap_get_entry(inode->i_mapping, index);
    if (folio && vma && userfaultfd_minor(vma)) {
        if (!xa_is_value(folio))
            folio_put(folio);
        *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
        return 0;
    }

    if (xa_is_value(folio)) {
        error = shmem_swapin_folio(inode, index, &folio,
                    sgp, gfp, vma, fault_type);
        if (error == -EEXIST)
            goto repeat;

        *foliop = folio;
        return error;
    }

    if (folio) {
        folio_lock(folio);

        /* Has the folio been truncated or swapped out? */
        if (unlikely(folio->mapping != inode->i_mapping)) {
            folio_unlock(folio);
            folio_put(folio);
            goto repeat;
        }
        if (sgp == SGP_WRITE)
            folio_mark_accessed(folio);
        if (folio_test_uptodate(folio))
            goto out;
        /* fallocated folio */
        if (sgp != SGP_READ)
            goto clear;
        folio_unlock(folio);
        folio_put(folio);
    }

    /*
     * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
     * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
     */
    *foliop = NULL;
    if (sgp == SGP_READ)
        return 0;
    if (sgp == SGP_NOALLOC)
        return -ENOENT;

    /*
     * Fast cache lookup and swap lookup did not find it: allocate.
     */

    if (vma && userfaultfd_missing(vma)) {
        *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
        return 0;
    }

    /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
    orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
    if (orders > 0) {
        gfp_t huge_gfp;

        huge_gfp = vma_thp_gfp_mask(vma);
        huge_gfp = limit_gfp_mask(huge_gfp, gfp);
        folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
                inode, index, fault_mm, orders);
        if (!IS_ERR(folio)) {
            if (folio_test_pmd_mappable(folio))
                count_vm_event(THP_FILE_ALLOC);
            count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
            goto alloced;
        }
        if (PTR_ERR(folio) == -EEXIST)
            goto repeat;
    }

    folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
    if (IS_ERR(folio)) {
        error = PTR_ERR(folio);
        if (error == -EEXIST)
            goto repeat;
        folio = NULL;
        goto unlock;
    }

alloced:
    alloced = true;
    if (folio_test_large(folio) &&
        DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                    folio_next_index(folio)) {
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);
        /*
         * Part of the large folio is beyond i_size: subject
         * to shrink under memory pressure.
         */
        spin_lock(&sbinfo->shrinklist_lock);
        /*
         * _careful to defend against unlocked access to
         * ->shrink_list in shmem_unused_huge_shrink()
         */
        if (list_empty_careful(&info->shrinklist)) {
            list_add_tail(&info->shrinklist,
                    &sbinfo->shrinklist);
            sbinfo->shrinklist_len++;
        }
        spin_unlock(&sbinfo->shrinklist_lock);
    }

    if (sgp == SGP_WRITE)
        folio_set_referenced(folio);
    /*
     * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
     */
    if (sgp == SGP_FALLOC)
        sgp = SGP_WRITE;
clear:
    /*
     * Let SGP_WRITE caller clear ends if write does not fill folio;
     * but SGP_FALLOC on a folio fallocated earlier must initialize
     * it now, lest undo on failure cancel our earlier guarantee.
     */
    if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
        long i, n = folio_nr_pages(folio);

        for (i = 0; i < n; i++)
            clear_highpage(folio_page(folio, i));
        flush_dcache_folio(folio);
        folio_mark_uptodate(folio);
    }

    /* Perhaps the file has been truncated since we checked */
    if (sgp <= SGP_CACHE &&
        ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
        error = -EINVAL;
        goto unlock;
    }
out:
    *foliop = folio;
    return 0;

    /*
     * Error recovery.
     */
unlock:
    if (alloced)
        filemap_remove_folio(folio);
    shmem_recalc_inode(inode, 0, 0);
    if (folio) {
        folio_unlock(folio);
        folio_put(folio);
    }
    return error;
}

/**
 * shmem_get_folio - find, and lock a shmem folio.
 * @inode:     inode to search
 * @index:     the page index.
 * @write_end: end of a write, could extend inode size
 * @foliop:    pointer to the folio if found
 * @sgp:       SGP_* flags to control behavior
 *
 * Looks up the page cache entry at @inode & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * If the caller modifies data in the folio, it must call folio_mark_dirty()
 * before unlocking the folio to ensure that the folio is not reclaimed.
 * There is no need to reserve space before calling folio_mark_dirty().
 *
 * When no folio is found, the behavior depends on @sgp:
 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
 *  - for all other flags a new folio is allocated, inserted into the
 *    page cache and returned locked in @foliop.
 *
 * Context: May sleep.
 * Return: 0 if successful, else a negative error code.
 */
int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
        struct folio **foliop, enum sgp_type sgp)
{
    return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
            mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
EXPORT_SYMBOL_GPL(shmem_get_folio);
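
/*
 * Editor's illustration (not part of shmem.c): a sketch of how an
 * in-kernel user of the exported shmem_get_folio() might read one byte
 * from a shmem inode it owns.  "my_read_byte" is hypothetical; error
 * handling is abbreviated:
 */
#if 0	/* in-kernel usage sketch */
static int my_read_byte(struct inode *inode, pgoff_t index, u8 *out)
{
    struct folio *folio;
    int err;

    err = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
    if (err)
        return err;
    if (!folio) {                  /* SGP_READ: a hole reads as zero */
        *out = 0;
        return 0;
    }
    folio_unlock(folio);           /* returned locked, with a reference */
    {
        u8 *kaddr = kmap_local_folio(folio, 0);

        *out = *kaddr;             /* first byte of the folio */
        kunmap_local(kaddr);
    }
    folio_put(folio);
    return 0;
}
#endif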
/*
 * This is like autoremove_wake_function, but it removes the wait queue
 * entry unconditionally - even if something else had already woken the
 * target.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait,
            unsigned int mode, int sync, void *key)
{
    int ret = default_wake_function(wait, mode, sync, key);

    list_del_init(&wait->entry);
    return ret;
}

/*
 * Trinity finds that probing a hole which tmpfs is punching can
 * prevent the hole-punch from ever completing: which in turn
 * locks writers out with its hold on i_rwsem.  So refrain from
 * faulting pages into the hole while it's being punched.  Although
 * shmem_undo_range() does remove the additions, it may be unable to
 * keep up, as each new page needs its own unmap_mapping_range() call,
 * and the i_mmap tree grows ever slower to scan if new vmas are added.
 *
 * It does not matter if we sometimes reach this check just before the
 * hole-punch begins, so that one fault then races with the punch:
 * we just need to make racing faults a rare case.
 *
 * The implementation below would be much simpler if we just used a
 * standard mutex or completion: but we cannot take i_rwsem in fault,
 * and bloating every shmem inode for this unlikely case would be sad.
 */
static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{
    struct shmem_falloc *shmem_falloc;
    struct file *fpin = NULL;
    vm_fault_t ret = 0;

    spin_lock(&inode->i_lock);
    shmem_falloc = inode->i_private;
    if (shmem_falloc &&
        shmem_falloc->waitq &&
        vmf->pgoff >= shmem_falloc->start &&
        vmf->pgoff < shmem_falloc->next) {
        wait_queue_head_t *shmem_falloc_waitq;
        DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

        ret = VM_FAULT_NOPAGE;
        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
        shmem_falloc_waitq = shmem_falloc->waitq;
        prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
                TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        schedule();

        /*
         * shmem_falloc_waitq points into the shmem_fallocate()
         * stack of the hole-punching task: shmem_falloc_waitq
         * is usually invalid by the time we reach here, but
         * finish_wait() does not dereference it in that case;
         * though i_lock needed lest racing with wake_up_all().
         */
        spin_lock(&inode->i_lock);
        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
    }
    spin_unlock(&inode->i_lock);
    if (fpin) {
        fput(fpin);
        ret = VM_FAULT_RETRY;
    }
    return ret;
}

static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
    struct inode *inode = file_inode(vmf->vma->vm_file);
    gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
    struct folio *folio = NULL;
    vm_fault_t ret = 0;
    int err;

    /*
     * Trinity finds that probing a hole which tmpfs is punching can
     * prevent the hole-punch from ever completing: noted in i_private.
     */
    if (unlikely(inode->i_private)) {
        ret = shmem_falloc_wait(vmf, inode);
        if (ret)
            return ret;
    }

    WARN_ON_ONCE(vmf->page != NULL);
    err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
                gfp, vmf, &ret);
    if (err)
        return vmf_error(err);
    if (folio) {
        vmf->page = folio_file_page(folio, vmf->pgoff);
        ret |= VM_FAULT_LOCKED;
    }
    return ret;
}
unsigned long shmem_get_unmapped_area(struct file *file,
                    unsigned long uaddr, unsigned long len,
                    unsigned long pgoff, unsigned long flags)
{
    unsigned long addr;
    unsigned long offset;
    unsigned long inflated_len;
    unsigned long inflated_addr;
    unsigned long inflated_offset;
    unsigned long hpage_size;

    if (len > TASK_SIZE)
        return -ENOMEM;

    addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
                    flags);

    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
        return addr;
    if (IS_ERR_VALUE(addr))
        return addr;
    if (addr & ~PAGE_MASK)
        return addr;
    if (addr > TASK_SIZE - len)
        return addr;

    if (shmem_huge == SHMEM_HUGE_DENY)
        return addr;
    if (flags & MAP_FIXED)
        return addr;
    /*
     * Our priority is to support MAP_SHARED mapped hugely;
     * and support MAP_PRIVATE mapped hugely too, until it is COWed.
     * But if caller specified an address hint and we allocated area there
     * successfully, respect that as before.
     */
    if (uaddr == addr)
        return addr;

    hpage_size = HPAGE_PMD_SIZE;
    if (shmem_huge != SHMEM_HUGE_FORCE) {
        struct super_block *sb;
        unsigned long __maybe_unused hpage_orders;
        int order = 0;

        if (file) {
            VM_BUG_ON(file->f_op != &shmem_file_operations);
            sb = file_inode(file)->i_sb;
        } else {
            /*
             * Called directly from mm/mmap.c, or drivers/char/mem.c
             * for "/dev/zero", to create a shared anonymous object.
             */
            if (IS_ERR(shm_mnt))
                return addr;
            sb = shm_mnt->mnt_sb;

            /*
             * Find the highest mTHP order used for anonymous shmem to
             * provide a suitable alignment address.
             */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
            hpage_orders = READ_ONCE(huge_shmem_orders_always);
            hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
            hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
            if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
                hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);

            if (hpage_orders > 0) {
                order = highest_order(hpage_orders);
                hpage_size = PAGE_SIZE << order;
            }
#endif
        }
        if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
            return addr;
    }

    if (len < hpage_size)
        return addr;

    offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
    if (offset && offset + len < 2 * hpage_size)
        return addr;
    if ((addr & (hpage_size - 1)) == offset)
        return addr;

    inflated_len = len + hpage_size - PAGE_SIZE;
    if (inflated_len > TASK_SIZE)
        return addr;
    if (inflated_len < len)
        return addr;

    inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
                        inflated_len, 0, flags);
    if (IS_ERR_VALUE(inflated_addr))
        return addr;
    if (inflated_addr & ~PAGE_MASK)
        return addr;

    inflated_offset = inflated_addr & (hpage_size - 1);
    inflated_addr += offset - inflated_offset;
    if (inflated_offset > offset)
        inflated_addr += hpage_size;

    if (inflated_addr > TASK_SIZE - len)
        return addr;
    return inflated_addr;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
    struct inode *inode = file_inode(vma->vm_file);

    return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                    unsigned long addr, pgoff_t *ilx)
{
    struct inode *inode = file_inode(vma->vm_file);
    pgoff_t index;

    /*
     * Bias interleave by inode number to distribute better across nodes;
     * but this interface is independent of which page order is used, so
     * supplies only that bias, letting caller apply the offset (adjusted
     * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
     */
    *ilx = inode->i_ino;
    index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
    return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}

static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
            pgoff_t index, unsigned int order, pgoff_t *ilx)
{
    struct mempolicy *mpol;

    /* Bias interleave by inode number to distribute better across nodes */
    *ilx = info->vfs_inode.i_ino + (index >> order);

    mpol = mpol_shared_policy_lookup(&info->policy, index);
    return mpol ? mpol : get_task_policy(current);
}
#else
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
            pgoff_t index, unsigned int order, pgoff_t *ilx)
{
    *ilx = 0;
    return NULL;
}
#endif /* CONFIG_NUMA */
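
/*
 * Editor's illustration (not part of shmem.c): the shared-policy lookup
 * above is what gives a shmem inode a per-range NUMA policy, installed
 * from userspace with mbind(2) on a MAP_SHARED mapping.  A sketch,
 * assuming a NUMA box with node 0 and linkage against libnuma for the
 * mbind() wrapper (-lnuma):
 */
#if 0	/* standalone userspace sketch */
#define _GNU_SOURCE
#include <numaif.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    unsigned long nodemask = 1UL << 0;      /* node 0 only */
    size_t len = 2UL << 20;
    int fd = memfd_create("mpol-demo", 0);

    ftruncate(fd, len);
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    /* For a shmem mapping this is stored in SHMEM_I(inode)->policy
     * via shmem_set_policy(), and survives munmap of this vma. */
    mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0);
    p[0] = 1;
    munmap(p, len);
    close(fd);
    return 0;
}
#endif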
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
    struct inode *inode = file_inode(file);
    struct shmem_inode_info *info = SHMEM_I(inode);
    int retval = -ENOMEM;

    /*
     * What serializes the accesses to info->flags?
     * ipc_lock_object() when called from shmctl_do_lock(),
     * no serialization needed when called from shm_destroy().
     */
    if (lock && !(info->flags & VM_LOCKED)) {
        if (!user_shm_lock(inode->i_size, ucounts))
            goto out_nomem;
        info->flags |= VM_LOCKED;
        mapping_set_unevictable(file->f_mapping);
    }
    if (!lock && (info->flags & VM_LOCKED) && ucounts) {
        user_shm_unlock(inode->i_size, ucounts);
        info->flags &= ~VM_LOCKED;
        mapping_clear_unevictable(file->f_mapping);
    }
    retval = 0;

out_nomem:
    return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
    struct inode *inode = file_inode(file);

    file_accessed(file);
    /* This is anonymous shared memory if it is unlinked at the time of mmap */
    if (inode->i_nlink)
        vma->vm_ops = &shmem_vm_ops;
    else
        vma->vm_ops = &shmem_anon_vm_ops;
    return 0;
}

static int shmem_file_open(struct inode *inode, struct file *file)
{
    file->f_mode |= FMODE_CAN_ODIRECT;
    return generic_file_open(inode, file);
}

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
 *
 * The casefold file attribute needs some special checks.  It can only be
 * added to an empty dir, and can't be removed from a non-empty dir.
 */
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
                    struct dentry *dentry, unsigned int *i_flags)
{
    unsigned int old = inode->i_flags;
    struct super_block *sb = inode->i_sb;

    if (fsflags & FS_CASEFOLD_FL) {
        if (!(old & S_CASEFOLD)) {
            if (!sb->s_encoding)
                return -EOPNOTSUPP;

            if (!S_ISDIR(inode->i_mode))
                return -ENOTDIR;

            if (dentry && !simple_empty(dentry))
                return -ENOTEMPTY;
        }

        *i_flags = *i_flags | S_CASEFOLD;
    } else if (old & S_CASEFOLD) {
        if (dentry && !simple_empty(dentry))
            return -ENOTEMPTY;
    }

    return 0;
}
#else
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
                    struct dentry *dentry, unsigned int *i_flags)
{
    if (fsflags & FS_CASEFOLD_FL)
        return -EOPNOTSUPP;

    return 0;
}
#endif

/*
 * chattr's fsflags are unrelated to extended attributes,
 * but tmpfs has chosen to enable them under the same config option.
 */
static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags,
                struct dentry *dentry)
{
    unsigned int i_flags = 0;
    int ret;

    ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
    if (ret)
        return ret;

    if (fsflags & FS_NOATIME_FL)
        i_flags |= S_NOATIME;
    if (fsflags & FS_APPEND_FL)
        i_flags |= S_APPEND;
    if (fsflags & FS_IMMUTABLE_FL)
        i_flags |= S_IMMUTABLE;
    /*
     * But FS_NODUMP_FL does not require any action in i_flags.
     */
    inode_set_flags(inode, i_flags,
            S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);

    return 0;
}
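
/*
 * Editor's illustration (not part of shmem.c): the fsflags handled above
 * are the chattr(1) flags, reachable on tmpfs via the FS_IOC_SETFLAGS
 * ioctl when the kernel was built with CONFIG_TMPFS_XATTR.  A sketch;
 * /dev/shm/demo is a hypothetical file on a tmpfs mount:
 */
#if 0	/* standalone userspace sketch */
#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/dev/shm/demo", O_CREAT | O_RDWR, 0600);
    int flags = 0;

    ioctl(fd, FS_IOC_GETFLAGS, &flags);
    flags |= FS_NOATIME_FL;                 /* maps to S_NOATIME above */
    ioctl(fd, FS_IOC_SETFLAGS, &flags);     /* i.e. chattr +A */
    close(fd);
    return 0;
}
#endif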
#else
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags,
                struct dentry *dentry)
{
}
#define shmem_initxattrs NULL
#endif

static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
{
    return &SHMEM_I(inode)->dir_offsets;
}

static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
                    struct super_block *sb,
                    struct inode *dir, umode_t mode,
                    dev_t dev, unsigned long flags)
{
    struct inode *inode;
    struct shmem_inode_info *info;
    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
    ino_t ino;
    int err;

    err = shmem_reserve_inode(sb, &ino);
    if (err)
        return ERR_PTR(err);

    inode = new_inode(sb);
    if (!inode) {
        shmem_free_inode(sb, 0);
        return ERR_PTR(-ENOSPC);
    }

    inode->i_ino = ino;
    inode_init_owner(idmap, inode, dir, mode);
    inode->i_blocks = 0;
    simple_inode_init_ts(inode);
    inode->i_generation = get_random_u32();
    info = SHMEM_I(inode);
    memset(info, 0, (char *)inode - (char *)info);
    spin_lock_init(&info->lock);
    atomic_set(&info->stop_eviction, 0);
    info->seals = F_SEAL_SEAL;
    info->flags = flags & VM_NORESERVE;
    info->i_crtime = inode_get_mtime(inode);
    info->fsflags = (dir == NULL) ? 0 :
        SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
    if (info->fsflags)
        shmem_set_inode_flags(inode, info->fsflags, NULL);
    INIT_LIST_HEAD(&info->shrinklist);
    INIT_LIST_HEAD(&info->swaplist);
    simple_xattrs_init(&info->xattrs);
    cache_no_acl(inode);
    if (sbinfo->noswap)
        mapping_set_unevictable(inode->i_mapping);

    /* Don't consider 'deny' for emergencies and 'force' for testing */
    if (sbinfo->huge)
        mapping_set_large_folios(inode->i_mapping);

    switch (mode & S_IFMT) {
    default:
        inode->i_op = &shmem_special_inode_operations;
        init_special_inode(inode, mode, dev);
        break;
    case S_IFREG:
        inode->i_mapping->a_ops = &shmem_aops;
        inode->i_op = &shmem_inode_operations;
        inode->i_fop = &shmem_file_operations;
        mpol_shared_policy_init(&info->policy,
                    shmem_get_sbmpol(sbinfo));
        break;
    case S_IFDIR:
        inc_nlink(inode);
        /* Some things misbehave if size == 0 on a directory */
        inode->i_size = 2 * BOGO_DIRENT_SIZE;
        inode->i_op = &shmem_dir_inode_operations;
        inode->i_fop = &simple_offset_dir_operations;
        simple_offset_init(shmem_get_offset_ctx(inode));
        break;
    case S_IFLNK:
        /*
         * Must not load anything in the rbtree,
         * mpol_free_shared_policy will not be called.
         */
        mpol_shared_policy_init(&info->policy, NULL);
        break;
    }

    lockdep_annotate_inode_mutex_key(inode);
    return inode;
}

#ifdef CONFIG_TMPFS_QUOTA
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                    struct super_block *sb, struct inode *dir,
                    umode_t mode, dev_t dev, unsigned long flags)
{
    int err;
    struct inode *inode;

    inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
    if (IS_ERR(inode))
        return inode;

    err = dquot_initialize(inode);
    if (err)
        goto errout;

    err = dquot_alloc_inode(inode);
    if (err) {
        dquot_drop(inode);
        goto errout;
    }
    return inode;

errout:
    inode->i_flags |= S_NOQUOTA;
    iput(inode);
    return ERR_PTR(err);
}
#else
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                    struct super_block *sb, struct inode *dir,
                    umode_t mode, dev_t dev, unsigned long flags)
{
    return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
#endif /* CONFIG_TMPFS_QUOTA */
#ifdef CONFIG_USERFAULTFD
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
            struct vm_area_struct *dst_vma,
            unsigned long dst_addr,
            unsigned long src_addr,
            uffd_flags_t flags,
            struct folio **foliop)
{
    struct inode *inode = file_inode(dst_vma->vm_file);
    struct shmem_inode_info *info = SHMEM_I(inode);
    struct address_space *mapping = inode->i_mapping;
    gfp_t gfp = mapping_gfp_mask(mapping);
    pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
    void *page_kaddr;
    struct folio *folio;
    int ret;
    pgoff_t max_off;

    if (shmem_inode_acct_blocks(inode, 1)) {
        /*
         * We may have got a page, returned -ENOENT triggering a retry,
         * and now we find ourselves with -ENOMEM. Release the page, to
         * avoid a BUG_ON in our caller.
         */
        if (unlikely(*foliop)) {
            folio_put(*foliop);
            *foliop = NULL;
        }
        return -ENOMEM;
    }

    if (!*foliop) {
        ret = -ENOMEM;
        folio = shmem_alloc_folio(gfp, 0, info, pgoff);
        if (!folio)
            goto out_unacct_blocks;

        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
            page_kaddr = kmap_local_folio(folio, 0);
            /*
             * The read mmap_lock is held here.  Despite the
             * mmap_lock being read recursive a deadlock is still
             * possible if a writer has taken a lock.  For example:
             *
             * process A thread 1 takes read lock on own mmap_lock
             * process A thread 2 calls mmap, blocks taking write lock
             * process B thread 1 takes page fault, read lock on own mmap lock
             * process B thread 2 calls mmap, blocks taking write lock
             * process A thread 1 blocks taking read lock on process B
             * process B thread 1 blocks taking read lock on process A
             *
             * Disable page faults to prevent potential deadlock
             * and retry the copy outside the mmap_lock.
             */
            pagefault_disable();
            ret = copy_from_user(page_kaddr,
                        (const void __user *)src_addr,
                        PAGE_SIZE);
            pagefault_enable();
            kunmap_local(page_kaddr);

            /* fallback to copy_from_user outside mmap_lock */
            if (unlikely(ret)) {
                *foliop = folio;
                ret = -ENOENT;
                /* don't free the page */
                goto out_unacct_blocks;
            }

            flush_dcache_folio(folio);
        } else {        /* ZEROPAGE */
            clear_user_highpage(&folio->page, dst_addr);
        }
    } else {
        folio = *foliop;
        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
        *foliop = NULL;
    }

    VM_BUG_ON(folio_test_locked(folio));
    VM_BUG_ON(folio_test_swapbacked(folio));
    __folio_set_locked(folio);
    __folio_set_swapbacked(folio);
    __folio_mark_uptodate(folio);

    ret = -EFAULT;
    max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    if (unlikely(pgoff >= max_off))
        goto out_release;

    ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
    if (ret)
        goto out_release;
    ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
    if (ret)
        goto out_release;

    ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                    &folio->page, true, flags);
    if (ret)
        goto out_delete_from_cache;

    shmem_recalc_inode(inode, 1, 0);
    folio_unlock(folio);
    return 0;
out_delete_from_cache:
    filemap_remove_folio(folio);
out_release:
    folio_unlock(folio);
    folio_put(folio);
out_unacct_blocks:
    shmem_inode_unacct_blocks(inode, 1);
    return ret;
}
#endif /* CONFIG_USERFAULTFD */

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

static int shmem_write_begin(const struct kiocb *iocb,
                struct address_space *mapping,
                loff_t pos, unsigned len,
                struct folio **foliop, void **fsdata)
{
    struct inode *inode = mapping->host;
    struct shmem_inode_info *info = SHMEM_I(inode);
    pgoff_t index = pos >> PAGE_SHIFT;
    struct folio *folio;
    int ret = 0;

    /* i_rwsem is held by caller */
    if (unlikely(info->seals & (F_SEAL_GROW |
                F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
        if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
            return -EPERM;
        if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
            return -EPERM;
    }

    ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
    if (ret)
        return ret;

    if (folio_contain_hwpoisoned_page(folio)) {
        folio_unlock(folio);
        folio_put(folio);
        return -EIO;
    }

    *foliop = folio;
    return 0;
}

static int shmem_write_end(const struct kiocb *iocb,
            struct address_space *mapping,
            loff_t pos, unsigned len, unsigned copied,
            struct folio *folio, void *fsdata)
{
    struct inode *inode = mapping->host;

    if (pos + copied > inode->i_size)
        i_size_write(inode, pos + copied);

    if (!folio_test_uptodate(folio)) {
        if (copied < folio_size(folio)) {
            size_t from = offset_in_folio(folio, pos);
            folio_zero_segments(folio, 0, from,
                    from + copied, folio_size(folio));
        }
        folio_mark_uptodate(folio);
    }
    folio_mark_dirty(folio);
    folio_unlock(folio);
    folio_put(folio);

    return copied;
}
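
/*
 * Editor's illustration (not part of shmem.c): the F_SEAL_* checks in
 * shmem_write_begin() above are what userspace sees through memfd
 * sealing; a write to a F_SEAL_WRITE'd memfd fails with EPERM.  Sketch,
 * assuming glibc 2.27+:
 */
#if 0	/* standalone userspace sketch */
#define _GNU_SOURCE
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = memfd_create("seal-demo", MFD_ALLOW_SEALING);

    ftruncate(fd, 4096);
    fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_WRITE);
    /* shmem_write_begin() now returns -EPERM for this inode. */
    assert(write(fd, "x", 1) == -1 && errno == EPERM);
    close(fd);
    return 0;
}
#endif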
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    struct file *file = iocb->ki_filp;
    struct inode *inode = file_inode(file);
    struct address_space *mapping = inode->i_mapping;
    pgoff_t index;
    unsigned long offset;
    int error = 0;
    ssize_t retval = 0;

    for (;;) {
        struct folio *folio = NULL;
        struct page *page = NULL;
        unsigned long nr, ret;
        loff_t end_offset, i_size = i_size_read(inode);
        bool fallback_page_copy = false;
        size_t fsize;

        if (unlikely(iocb->ki_pos >= i_size))
            break;

        index = iocb->ki_pos >> PAGE_SHIFT;
        error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
        if (error) {
            if (error == -EINVAL)
                error = 0;
            break;
        }
        if (folio) {
            folio_unlock(folio);

            page = folio_file_page(folio, index);
            if (PageHWPoison(page)) {
                folio_put(folio);
                error = -EIO;
                break;
            }

            if (folio_test_large(folio) &&
                folio_test_has_hwpoisoned(folio))
                fallback_page_copy = true;
        }

        /*
         * We must evaluate after, since reads (unlike writes)
         * are called without i_rwsem protection against truncate
         */
        i_size = i_size_read(inode);
        if (unlikely(iocb->ki_pos >= i_size)) {
            if (folio)
                folio_put(folio);
            break;
        }
        end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
        if (folio && likely(!fallback_page_copy))
            fsize = folio_size(folio);
        else
            fsize = PAGE_SIZE;
        offset = iocb->ki_pos & (fsize - 1);
        nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);

        if (folio) {
            /*
             * If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (mapping_writably_mapped(mapping)) {
                if (likely(!fallback_page_copy))
                    flush_dcache_folio(folio);
                else
                    flush_dcache_page(page);
            }

            /*
             * Mark the folio accessed if we read the beginning.
             */
            if (!offset)
                folio_mark_accessed(folio);
            /*
             * Ok, we have the page, and it's up-to-date, so
             * now we can copy it to user space...
             */
            if (likely(!fallback_page_copy))
                ret = copy_folio_to_iter(folio, offset, nr, to);
            else
                ret = copy_page_to_iter(page, offset, nr, to);
            folio_put(folio);
        } else if (user_backed_iter(to)) {
            /*
             * Copy to user tends to be so well optimized, but
             * clear_user() not so much, that it is noticeably
             * faster to copy the zero page instead of clearing.
             */
            ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
        } else {
            /*
             * But submitting the same page twice in a row to
             * splice() - or others? - can result in confusion:
             * so don't attempt that optimization on pipes etc.
             */
            ret = iov_iter_zero(nr, to);
        }

        retval += ret;
        iocb->ki_pos += ret;
        if (!iov_iter_count(to))
            break;
        if (ret < nr) {
            error = -EFAULT;
            break;
        }
        cond_resched();
    }

    file_accessed(file);
    return retval ? retval : error;
}

static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    struct file *file = iocb->ki_filp;
    struct inode *inode = file->f_mapping->host;
    ssize_t ret;

    inode_lock(inode);
    ret = generic_write_checks(iocb, from);
    if (ret <= 0)
        goto unlock;
    ret = file_remove_privs(file);
    if (ret)
        goto unlock;
    ret = file_update_time(file);
    if (ret)
        goto unlock;
    ret = generic_perform_write(iocb, from);
unlock:
    inode_unlock(inode);
    return ret;
}

static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
    return true;
}

static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
}

static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                    struct pipe_buffer *buf)
{
    return false;
}

static const struct pipe_buf_operations zero_pipe_buf_ops = {
    .release   = zero_pipe_buf_release,
    .try_steal = zero_pipe_buf_try_steal,
    .get       = zero_pipe_buf_get,
};

static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
                    loff_t fpos, size_t size)
{
    size_t offset = fpos & ~PAGE_MASK;

    size = min_t(size_t, size, PAGE_SIZE - offset);

    if (!pipe_is_full(pipe)) {
        struct pipe_buffer *buf = pipe_head_buf(pipe);

        *buf = (struct pipe_buffer) {
            .ops    = &zero_pipe_buf_ops,
            .page   = ZERO_PAGE(0),
            .offset = offset,
            .len    = size,
        };
        pipe->head++;
    }

    return size;
}
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                    struct pipe_inode_info *pipe,
                    size_t len, unsigned int flags)
{
    struct inode *inode = file_inode(in);
    struct address_space *mapping = inode->i_mapping;
    struct folio *folio = NULL;
    size_t total_spliced = 0, used, npages, n, part;
    loff_t isize;
    int error = 0;

    /* Work out how much data we can actually add into the pipe */
    used = pipe_buf_usage(pipe);
    npages = max_t(ssize_t, pipe->max_usage - used, 0);
    len = min_t(size_t, len, npages * PAGE_SIZE);

    do {
        bool fallback_page_splice = false;
        struct page *page = NULL;
        pgoff_t index;
        size_t size;

        if (*ppos >= i_size_read(inode))
            break;

        index = *ppos >> PAGE_SHIFT;
        error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
        if (error) {
            if (error == -EINVAL)
                error = 0;
            break;
        }
        if (folio) {
            folio_unlock(folio);

            page = folio_file_page(folio, index);
            if (PageHWPoison(page)) {
                error = -EIO;
                break;
            }

            if (folio_test_large(folio) &&
                folio_test_has_hwpoisoned(folio))
                fallback_page_splice = true;
        }

        /*
         * i_size must be checked after we know the pages are Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */
        isize = i_size_read(inode);
        if (unlikely(*ppos >= isize))
            break;

        /*
         * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
         * pages.
         */
        size = len;
        if (unlikely(fallback_page_splice)) {
            size_t offset = *ppos & ~PAGE_MASK;

            size = umin(size, PAGE_SIZE - offset);
        }
        part = min_t(loff_t, isize - *ppos, size);

        if (folio) {
            /*
             * If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (mapping_writably_mapped(mapping)) {
                if (likely(!fallback_page_splice))
                    flush_dcache_folio(folio);
                else
                    flush_dcache_page(page);
            }
            folio_mark_accessed(folio);
            /*
             * Ok, we have the page, and it's up-to-date, so we can
             * now splice it into the pipe.
             */
            n = splice_folio_into_pipe(pipe, folio, *ppos, part);
            folio_put(folio);
            folio = NULL;
        } else {
            n = splice_zeropage_into_pipe(pipe, *ppos, part);
        }

        if (!n)
            break;
        len -= n;
        total_spliced += n;
        *ppos += n;
        in->f_ra.prev_pos = *ppos;
        if (pipe_is_full(pipe))
            break;

        cond_resched();
    } while (len);

    if (folio)
        folio_put(folio);

    file_accessed(in);
    return total_spliced ? total_spliced : error;
}

static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
    struct address_space *mapping = file->f_mapping;
    struct inode *inode = mapping->host;

    if (whence != SEEK_DATA && whence != SEEK_HOLE)
        return generic_file_llseek_size(file, offset, whence,
                    MAX_LFS_FILESIZE, i_size_read(inode));
    if (offset < 0)
        return -ENXIO;

    inode_lock(inode);
    /* We're holding i_rwsem so we can access i_size directly */
    offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
    if (offset >= 0)
        offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
    inode_unlock(inode);
    return offset;
}
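
/*
 * Editor's illustration (not part of shmem.c): shmem_file_llseek() above
 * wires SEEK_DATA/SEEK_HOLE to mapping_seek_hole_data(), so tmpfs reports
 * never-written ranges as holes.  Sketch:
 */
#if 0	/* standalone userspace sketch */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = memfd_create("seek-demo", 0);

    ftruncate(fd, 1 << 20);              /* 1 MiB hole */
    pwrite(fd, "x", 1, 512 * 1024);      /* one page of data at 512 KiB */
    printf("first data at %lld\n", (long long)lseek(fd, 0, SEEK_DATA));
    printf("next hole at  %lld\n",
           (long long)lseek(fd, 512 * 1024, SEEK_HOLE));
    close(fd);
    return 0;
}
#endif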
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                loff_t len)
{
    struct inode *inode = file_inode(file);
    struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
    struct shmem_inode_info *info = SHMEM_I(inode);
    struct shmem_falloc shmem_falloc;
    pgoff_t start, index, end, undo_fallocend;
    int error;

    if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
        return -EOPNOTSUPP;

    inode_lock(inode);

    if (mode & FALLOC_FL_PUNCH_HOLE) {
        struct address_space *mapping = file->f_mapping;
        loff_t unmap_start = round_up(offset, PAGE_SIZE);
        loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

        /* protected by i_rwsem */
        if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
            error = -EPERM;
            goto out;
        }

        shmem_falloc.waitq = &shmem_falloc_waitq;
        shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
        shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
        spin_lock(&inode->i_lock);
        inode->i_private = &shmem_falloc;
        spin_unlock(&inode->i_lock);

        if ((u64)unmap_end > (u64)unmap_start)
            unmap_mapping_range(mapping, unmap_start,
                        1 + unmap_end - unmap_start, 0);
        shmem_truncate_range(inode, offset, offset + len - 1);
        /* No need to unmap again: hole-punching leaves COWed pages */

        spin_lock(&inode->i_lock);
        inode->i_private = NULL;
        wake_up_all(&shmem_falloc_waitq);
        WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
        spin_unlock(&inode->i_lock);
        error = 0;
        goto out;
    }

    /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
    error = inode_newsize_ok(inode, offset + len);
    if (error)
        goto out;

    if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
        error = -EPERM;
        goto out;
    }

    start = offset >> PAGE_SHIFT;
    end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
    /* Try to avoid a swapstorm if len is impossible to satisfy */
    if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
        error = -ENOSPC;
        goto out;
    }

    shmem_falloc.waitq = NULL;
    shmem_falloc.start = start;
    shmem_falloc.next  = start;
    shmem_falloc.nr_falloced = 0;
    shmem_falloc.nr_unswapped = 0;
    spin_lock(&inode->i_lock);
    inode->i_private = &shmem_falloc;
    spin_unlock(&inode->i_lock);

    /*
     * info->fallocend is only relevant when huge pages might be
     * involved: to prevent split_huge_page() freeing fallocated
     * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
     */
    undo_fallocend = info->fallocend;
    if (info->fallocend < end)
        info->fallocend = end;

    for (index = start; index < end; ) {
        struct folio *folio;

        /*
         * Check for fatal signal so that we abort early in OOM
         * situations.  We don't want to abort in case of non-fatal
         * signals as large fallocate can take noticeable time and
         * e.g. periodic timers may result in fallocate constantly
         * restarting.
         */
        if (fatal_signal_pending(current))
            error = -EINTR;
        else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
            error = -ENOMEM;
        else
            error = shmem_get_folio(inode, index, offset + len,
                        &folio, SGP_FALLOC);
        if (error) {
            info->fallocend = undo_fallocend;
            /* Remove the !uptodate folios we added */
            if (index > start) {
                shmem_undo_range(inode,
                    (loff_t)start << PAGE_SHIFT,
                    ((loff_t)index << PAGE_SHIFT) - 1, true);
            }
            goto undone;
        }

        /*
         * Here is a more important optimization than it appears:
         * a second SGP_FALLOC on the same large folio will clear it,
         * making it uptodate and un-undoable if we fail later.
         */
        index = folio_next_index(folio);
        /* Beware 32-bit wraparound */
        if (!index)
            index--;

        /*
         * Inform shmem_writeout() how far we have reached.
         * No need for lock or barrier: we have the page lock.
         */
        if (!folio_test_uptodate(folio))
            shmem_falloc.nr_falloced += index - shmem_falloc.next;
        shmem_falloc.next = index;

        /*
         * If !uptodate, leave it that way so that freeable folios
         * can be recognized if we need to rollback on error later.
         * But mark it dirty so that memory pressure will swap rather
         * than free the folios we are allocating (and SGP_CACHE folios
         * might still be clean: we now need to mark those dirty too).
         */
        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);
        cond_resched();
    }

    if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
        i_size_write(inode, offset + len);
undone:
    spin_lock(&inode->i_lock);
    inode->i_private = NULL;
    spin_unlock(&inode->i_lock);
out:
    if (!error)
        file_modified(file);
    inode_unlock(inode);
    return error;
}

static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
    struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

    buf->f_type = TMPFS_MAGIC;
    buf->f_bsize = PAGE_SIZE;
    buf->f_namelen = NAME_MAX;
    if (sbinfo->max_blocks) {
        buf->f_blocks = sbinfo->max_blocks;
        buf->f_bavail =
        buf->f_bfree  = sbinfo->max_blocks -
                percpu_counter_sum(&sbinfo->used_blocks);
    }
    if (sbinfo->max_inodes) {
        buf->f_files = sbinfo->max_inodes;
        buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
    }
    /* else leave those fields 0 like simple_statfs */

    buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);

    return 0;
}
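
/*
 * Editor's illustration (not part of shmem.c): both shmem_fallocate()
 * branches above are reachable from userspace.  A sketch of tmpfs
 * preallocation followed by hole-punching (which frees the pages again);
 * note the VFS requires FALLOC_FL_KEEP_SIZE together with PUNCH_HOLE:
 */
#if 0	/* standalone userspace sketch */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = memfd_create("falloc-demo", 0);

    /* Allocates !uptodate folios, marked dirty, and extends i_size. */
    fallocate(fd, 0, 0, 1 << 20);
    /* Unmaps and truncates the middle range; concurrent faulters wait
     * in shmem_falloc_wait() until the punch completes. */
    fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
              256 * 1024, 512 * 1024);
    close(fd);
    return 0;
}
#endif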
*/ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. 
 */
	if (inode->i_nlink) {
		ret = shmem_reserve_inode(inode->i_sb, NULL);
		if (ret)
			goto out;
	}

	ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
	if (ret) {
		if (inode->i_nlink)
			shmem_free_inode(inode->i_sb, 0);
		goto out;
	}

	dir->i_size += BOGO_DIRENT_SIZE;
	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	inode_inc_iversion(dir);
	inc_nlink(inode);
	ihold(inode);	/* New dentry reference */
	dget(dentry);	/* Extra pinning count for the created dentry */
	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
		d_add(dentry, inode);
	else
		d_instantiate(dentry, inode);
out:
	return ret;
}

static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
		shmem_free_inode(inode->i_sb, 0);

	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);

	dir->i_size -= BOGO_DIRENT_SIZE;
	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	inode_inc_iversion(dir);
	drop_nlink(inode);
	dput(dentry);	/* Undo the count from "create" - does all the work */

	/*
	 * For now, VFS can't deal with case-insensitive negative dentries, so
	 * we invalidate them.
	 */
	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
		d_invalidate(dentry);

	return 0;
}

static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	drop_nlink(d_inode(dentry));
	drop_nlink(dir);
	return shmem_unlink(dir, dentry);
}

static int shmem_whiteout(struct mnt_idmap *idmap,
			  struct inode *old_dir, struct dentry *old_dentry)
{
	struct dentry *whiteout;
	int error;

	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
	if (!whiteout)
		return -ENOMEM;

	error = shmem_mknod(idmap, old_dir, whiteout,
			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
	dput(whiteout);
	if (error)
		return error;

	/*
	 * Cheat and hash the whiteout while the old dentry is still in
	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
	 *
	 * d_lookup() will consistently find one of them at this point,
	 * not sure which one, but that isn't even important.
	 */
	d_rehash(whiteout);
	return 0;
}

/*
 * The VFS layer already does all the dentry stuff for rename;
 * we just have to decrement the usage count for the target if
 * it exists, so that the VFS layer correctly frees it when it
 * gets overwritten.
*/ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef 
CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); simple_xattr_add(&info->xattrs, new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if 
(!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, 
Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = 
result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: 
unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } /* * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" * counterpart for (re-)enabling swap. */ if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. 
*/ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 
64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode *inode) { if 
(S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; static const struct vm_operations_struct 
shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; #ifdef CONFIG_TMPFS fc->sb_flags |= SB_I_VERSION; #endif return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. 
*/ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && order != HPAGE_PMD_ORDER) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, 
&huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); 
		} else {
				pr_err("invalid policy %s in thp_shmem boot parameter\n",
				       policy);
				goto err;
			}
		}
	}

	huge_shmem_orders_always = always;
	huge_shmem_orders_madvise = madvise;
	huge_shmem_orders_inherit = inherit;
	huge_shmem_orders_within_size = within_size;
	shmem_orders_configured = true;
	return 1;

err:
	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_shmem=", setup_thp_shmem);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
	return inode ? inode : ERR_PTR(-ENOSPC);
}

#endif /* CONFIG_SHMEM */

/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags,
				       unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);
	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);
	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);
	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);
	if (IS_ERR(inode)) {
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *	kernel internal. There will be NO LSM permission checks against the
 *	underlying inode. So users of this interface must do LSM checks at a
 *	higher layer. The users are the big_key and shm implementations. LSM
 *	checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file; loff_t size = vma->vm_end - vma->vm_start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
*/ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
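/*
 * Illustrative sketch, not part of shmem.c: a typical in-kernel consumer of
 * the setup/read helpers exported above. example_use_shmem() is hypothetical;
 * shmem_kernel_file_setup() and shmem_read_mapping_page_gfp() are the real
 * interfaces. Real callers (e.g. i915) mix __GFP_NORETRY | __GFP_NOWARN into
 * mapping_gfp_mask() rather than passing plain GFP_KERNEL.
 */
static int example_use_shmem(void)
{
	struct file *file;
	struct page *page;
	int err = 0;

	/* Unlinked tmpfs file; caller is responsible for LSM checks */
	file = shmem_kernel_file_setup("example-buf", PAGE_SIZE, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Find or allocate page 0, even if it currently sits in swapcache */
	page = shmem_read_mapping_page_gfp(file->f_mapping, 0, GFP_KERNEL);
	if (IS_ERR(page)) {
		err = PTR_ERR(page);
	} else {
		/* ... fill or read the page contents here ... */
		put_page(page);
	}
	fput(file);
	return err;
}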
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DELAY_H
#define _LINUX_DELAY_H

/*
 * Copyright (C) 1993 Linus Torvalds
 *
 * Delay routines, using a pre-computed "loops_per_jiffy" value.
 * Sleep routines using timer list timers or hrtimers.
 */

#include <linux/math.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

extern unsigned long loops_per_jiffy;

#include <asm/delay.h>

/*
 * Using udelay() for intervals greater than a few milliseconds can
 * risk overflow for high loops_per_jiffy (high bogomips) machines. The
 * mdelay() provides a wrapper to prevent this. For delays greater
 * than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture
 * specific values can be defined in asm-???/delay.h as an override.
 * The 2nd mdelay() definition ensures GCC will optimize away the
 * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G.
 */
#ifndef MAX_UDELAY_MS
#define MAX_UDELAY_MS	5
#endif

#ifndef mdelay
/**
 * mdelay - Inserting a delay based on milliseconds with busy waiting
 * @n: requested delay in milliseconds
 *
 * See udelay() for basic information about mdelay() and its variants.
 *
 * Please double check whether mdelay() is the right way to go, or whether
 * refactoring the code to use msleep() instead is the better option.
 */
#define mdelay(n) (\
	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif

#ifndef ndelay
static inline void ndelay(unsigned long x)
{
	udelay(DIV_ROUND_UP(x, 1000));
}
#define ndelay(x) ndelay(x)
#endif

extern unsigned long lpj_fine;
void calibrate_delay(void);
unsigned long calibrate_delay_is_known(void);
void __attribute__((weak)) calibration_delay_done(void);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range_state(unsigned long min, unsigned long max,
			unsigned int state);

/**
 * usleep_range - Sleep for an approximate time
 * @min: Minimum time in microseconds to sleep
 * @max: Maximum time in microseconds to sleep
 *
 * For basic information please refer to usleep_range_state().
 *
 * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
 */
static inline void usleep_range(unsigned long min, unsigned long max)
{
	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
}

/**
 * usleep_range_idle - Sleep for an approximate time with idle time accounting
 * @min: Minimum time in microseconds to sleep
 * @max: Maximum time in microseconds to sleep
 *
 * For basic information please refer to usleep_range_state().
 *
 * The sleeping task has the state TASK_IDLE during the sleep to prevent
 * contribution to the load average.
 */
static inline void usleep_range_idle(unsigned long min, unsigned long max)
{
	usleep_range_state(min, max, TASK_IDLE);
}

/**
 * ssleep - wrapper for seconds around msleep
 * @seconds: Requested sleep duration in seconds
 *
 * Please refer to msleep() for detailed information.
 */
static inline void ssleep(unsigned int seconds)
{
	msleep(seconds * 1000);
}

static const unsigned int max_slack_shift = 2;
#define USLEEP_RANGE_UPPER_BOUND	((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC)

/**
 * fsleep - flexible sleep which autoselects the best mechanism
 * @usecs: requested sleep duration in microseconds
 *
 * fsleep() selects the best mechanism that will provide maximum 25% slack
 * to the requested sleep duration. Therefore it uses:
 *
 * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer
 *   overhead for really short sleep durations.
 * * usleep_range() for sleep durations for which msleep() would incur a
 *   slack larger than 25%. This depends on the granularity of jiffies.
 * * msleep() for all other sleep durations.
 *
 * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with
 * the granularity of jiffies and the slack might exceed 25% especially for
 * short sleep durations.
 */
static inline void fsleep(unsigned long usecs)
{
	if (usecs <= 10)
		udelay(usecs);
	else if (usecs < USLEEP_RANGE_UPPER_BOUND)
		usleep_range(usecs, usecs + (usecs >> max_slack_shift));
	else
		msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC));
}

#endif /* defined(_LINUX_DELAY_H) */
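/*
 * Illustrative sketch, not part of delay.h: choosing a delay primitive in a
 * hypothetical driver poll loop (needs <linux/io.h> for readl() and
 * <linux/errno.h>). example_poll_ready() and the register offset are made
 * up; fsleep() is the real interface declared above. With usecs == 50
 * (greater than 10 and well below USLEEP_RANGE_UPPER_BOUND for any HZ),
 * fsleep() resolves to usleep_range(50, 62), i.e. 25% slack from
 * max_slack_shift.
 */
static int example_poll_ready(void __iomem *base)
{
	int tries;

	for (tries = 0; tries < 100; tries++) {
		if (readl(base + 0x04) & 0x1)	/* hypothetical status bit */
			return 0;
		fsleep(50);	/* sleeps, does not busy-wait like udelay() */
	}
	return -ETIMEDOUT;
}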
// SPDX-License-Identifier: GPL-2.0
/*
 * Supplementary group IDs
 */
#include <linux/cred.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

struct group_info *groups_alloc(int gidsetsize)
{
	struct group_info *gi;

	gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
	if (!gi)
		return NULL;

	refcount_set(&gi->usage, 1);
	gi->ngroups = gidsetsize;
	return gi;
}
EXPORT_SYMBOL(groups_alloc);

void groups_free(struct group_info *group_info)
{
	kvfree(group_info);
}
EXPORT_SYMBOL(groups_free);

/* export the group_info to a user-space array */
static int groups_to_user(gid_t __user *grouplist,
			  const struct group_info *group_info)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;
	unsigned int count = group_info->ngroups;

	for (i = 0; i < count; i++) {
		gid_t gid;

		gid = from_kgid_munged(user_ns, group_info->gid[i]);
		if (put_user(gid, grouplist+i))
			return -EFAULT;
	}
	return 0;
}

/* fill a group_info from a user-space array - it must be allocated already */
static int groups_from_user(struct group_info *group_info,
			    gid_t __user *grouplist)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;
	unsigned int count = group_info->ngroups;

	for (i = 0; i < count; i++) {
		gid_t gid;
		kgid_t kgid;

		if (get_user(gid, grouplist+i))
			return -EFAULT;

		kgid = make_kgid(user_ns, gid);
		if (!gid_valid(kgid))
			return -EINVAL;

		group_info->gid[i] = kgid;
	}
	return 0;
}

static int gid_cmp(const void *_a, const void *_b)
{
	kgid_t a = *(kgid_t *)_a;
	kgid_t b = *(kgid_t *)_b;

	return gid_gt(a, b) - gid_lt(a, b);
}

void groups_sort(struct group_info *group_info)
{
	sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
	     gid_cmp, NULL);
}
EXPORT_SYMBOL(groups_sort);

/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
{
	unsigned int left, right;

	if (!group_info)
		return 0;

	left = 0;
	right = group_info->ngroups;
	while (left < right) {
		unsigned int mid = (left+right)/2;
		if (gid_gt(grp, group_info->gid[mid]))
			left = mid + 1;
		else if (gid_lt(grp, group_info->gid[mid]))
			right = mid;
		else
			return 1;
	}
	return 0;
}

/**
 * set_groups - Change a group subscription in a set of credentials
 * @new: The newly prepared set of credentials to alter
 * @group_info: The group list to install
 */
void set_groups(struct cred *new, struct group_info *group_info)
{
	put_group_info(new->group_info);
	get_group_info(group_info);
	new->group_info = group_info;
}
EXPORT_SYMBOL(set_groups); /** * set_current_groups - Change current's group subscription * @group_info: The group list to impose * * Validate a group subscription and, if valid, impose it upon current's task * security record. */ int set_current_groups(struct group_info *group_info) { struct cred *new; const struct cred *old; int retval; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); set_groups(new, group_info); retval = security_task_fix_setgroups(new, old); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } EXPORT_SYMBOL(set_current_groups); SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; /* no need to grab task_lock here; it cannot change */ i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: return i; } bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. */ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; group_info = groups_alloc(gidsetsize); if (!group_info) return -ENOMEM; retval = groups_from_user(group_info, grouplist); if (retval) { put_group_info(group_info); return retval; } groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); return retval; } /* * Check whether we're fsgid/egid or in the supplemental group list. */ int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_egroup_p);
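The supplementary-group fast path above hinges on groups_sort() ordering the array once at setgroups() time so that groups_search() can binary-search it on every permission check. The following user-space sketch mirrors that scheme under stated assumptions: plain gid_t and qsort() stand in for kgid_t and the kernel's sort(), and gid_search() is a hypothetical name introduced here for illustration.

/* User-space sketch of the sorted-array membership scheme used by
 * groups_sort()/groups_search(): sort once, binary-search per lookup.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

static int gid_cmp(const void *a, const void *b)
{
	gid_t x = *(const gid_t *)a, y = *(const gid_t *)b;

	return (x > y) - (x < y);	/* same trick as gid_gt() - gid_lt() */
}

static int gid_search(const gid_t *gids, unsigned int n, gid_t grp)
{
	unsigned int left = 0, right = n;

	while (left < right) {
		unsigned int mid = (left + right) / 2;

		if (grp > gids[mid])
			left = mid + 1;
		else if (grp < gids[mid])
			right = mid;
		else
			return 1;	/* member */
	}
	return 0;			/* not a member */
}

int main(void)
{
	gid_t gids[] = { 1000, 4, 24, 27, 100 };
	unsigned int n = sizeof(gids) / sizeof(gids[0]);

	qsort(gids, n, sizeof(gids[0]), gid_cmp);	/* groups_sort() analog */
	printf("24 member? %d\n", gid_search(gids, n, 24));	/* prints 1 */
	printf("25 member? %d\n", gid_search(gids, n, 25));	/* prints 0 */
	return 0;
}

The design choice is the usual one: sorting costs O(n log n) once per setgroups(), and in_group_p() then pays only O(log n) per check instead of a linear scan.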
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2015 Tom Herbert <tom@herbertland.com> */ #ifndef __ILA_H #define __ILA_H #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/checksum.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/protocol.h> #include <uapi/linux/ila.h> struct ila_locator { union { __u8 v8[8]; __be16 v16[4]; __be32 v32[2]; __be64 v64; }; }; struct ila_identifier { union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 __space:4; u8 csum_neutral:1; u8 type:3; #elif defined(__BIG_ENDIAN_BITFIELD) u8 type:3; u8 csum_neutral:1; u8 __space:4; #else #error "Adjust your <asm/byteorder.h> defines" #endif u8 __space2[7]; }; __u8 v8[8]; __be16 v16[4]; __be32 v32[2]; __be64 v64; }; }; #define CSUM_NEUTRAL_FLAG htonl(0x10000000) struct ila_addr { union { struct in6_addr addr; struct { struct ila_locator loc; struct ila_identifier ident; }; }; }; static inline struct ila_addr *ila_a2i(struct in6_addr *addr) { return (struct ila_addr *)addr; } struct ila_params { struct ila_locator locator; struct ila_locator locator_match; __wsum csum_diff; u8 csum_mode; u8 ident_type; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) { __be32 diff[] = { ~from[0], ~from[1], to[0], to[1], }; return csum_partial(diff, sizeof(diff), 0); } static inline bool ila_csum_neutral_set(struct ila_identifier ident) { return !!(ident.csum_neutral); } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, bool set_csum_neutral); void ila_init_saved_csum(struct ila_params *p); struct ila_net { struct { struct rhashtable rhash_table; spinlock_t *locks; /* Bucket locks for entry manipulation */ unsigned int locks_mask; bool hooks_registered; } xlat; }; int ila_lwt_init(void); void ila_lwt_fini(void); int ila_xlat_init_net(struct net *net); void ila_xlat_pre_exit_net(struct net *net); void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_dump_start(struct netlink_callback *cb); int ila_xlat_nl_dump_done(struct netlink_callback *cb); int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb); extern unsigned int ila_net_id; extern struct genl_family ila_nl_family; #endif /* __ILA_H */
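The checksum trick behind compute_csum_diff8() above deserves a small demonstration: when ILA rewrites the 64-bit locator, the transport checksum can be patched by folding in the one's-complement sum of the complemented old words plus the new words, instead of recomputing the checksum over the whole packet. The sketch below is user-space C under simplifying assumptions (32-bit accumulation with end-around carry, no final 16-bit fold, host byte order); csum_partial32() and csum_diff8() are hypothetical stand-ins for the kernel's arch-optimized csum_partial() and the compute_csum_diff8() helper.

/* User-space sketch of the compute_csum_diff8() idea: the checksum
 * delta for rewriting two 32-bit words is csum(~old[0], ~old[1],
 * new[0], new[1]).
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t csum_partial32(const uint32_t *buf, int nwords, uint32_t sum)
{
	for (int i = 0; i < nwords; i++) {
		sum += buf[i];
		if (sum < buf[i])	/* end-around carry */
			sum++;
	}
	return sum;
}

static uint32_t csum_diff8(const uint32_t from[2], const uint32_t to[2])
{
	uint32_t diff[4] = { ~from[0], ~from[1], to[0], to[1] };

	return csum_partial32(diff, 4, 0);
}

int main(void)
{
	uint32_t old_loc[2] = { 0x20010db8, 0x00000001 };	/* example locator */
	uint32_t new_loc[2] = { 0x20010db8, 0x00000002 };

	/* This delta can be folded into an existing TCP/UDP checksum
	 * rather than recomputing it over the full payload. */
	printf("csum diff: 0x%08x\n", csum_diff8(old_loc, new_loc));
	return 0;
}

This is also why the identifier carries a csum_neutral bit: when the mapping is chosen so the locator rewrite sums to zero, even this per-packet fold can be skipped.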
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * IPv4 specific functions * * code split from: * linux/ipv4/tcp.c * linux/ipv4/tcp_input.c * linux/ipv4/tcp_output.c * * See tcp.c for author information */ /* * Changes: * David S. Miller : New socket lookup architecture. * This code is dedicated to John Dyson. * David S. Miller : Change semantics of established hash, * half is devoted to TIME_WAIT sockets * and the rest go in the other half. * Andi Kleen : Add support for syncookies and fixed * some bugs: ip options weren't passed to * the TCP layer, missed a check for an * ACK bit. * Andi Kleen : Implemented fast path mtu discovery. * Fixed many serious bugs in the * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. * Vitaly E. Lavrov : Transparent proxy revived after year * coma. * Andi Kleen : Fix new listen. * Andi Kleen : Fix accept error reporting. * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time.
*/ #define pr_fmt(fmt) "TCP: " fmt #include <linux/bottom_half.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/random.h> #include <linux/cache.h> #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sock_diag.h> #include <net/aligned_data.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <net/psp.h> #include <linux/inet.h> #include <linux/ipv6.h> #include <linux/stddef.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <linux/btf_ids.h> #include <linux/skbuff_ref.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); #endif struct inet_hashinfo tcp_hashinfo; static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static DEFINE_MUTEX(tcp_exit_batch_mutex); static u32 tcp_v4_init_seq(const struct sk_buff *skb) { return secure_tcp_seq(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; u32 reuse_thresh; if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) reuse = 0; if (reuse == 2) { /* Still does not detect *everything* that goes through * lo, since we require a loopback src or dst address * or direct binding to 'lo' interface. */ bool loopback = false; if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) loopback = true; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif { if (ipv4_is_loopback(tw->tw_daddr) || ipv4_is_loopback(tw->tw_rcv_saddr)) loopback = true; } if (!loopback) reuse = 0; } /* With PAWS, it is safe from the viewpoint of data integrity. Even without PAWS it is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. Actually, the idea is close to VJ's one, only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); if (ts_recent_stamp && (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. 
*/ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) return 0; /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair * process. * * Without this check re-using a TIME-WAIT socket with TCP * repair would accumulate a -1 on the repair assigned * sequence number. The first time it is reused the sequence * is -1, the second time -2, etc. This fixes that issue * without appearing to create any others. */ if (likely(!tp->repair)) { u32 seq = tcptw->tw_snd_nxt + 65535 + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tp->rx_opt.ts_recent_stamp = ts_recent_stamp; } return 1; } return 0; } EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v4_connect() and intended to * prevent the BPF program called below from accessing bytes that are out * of the bounds specified by the user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); } /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } } else { sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However, we set the state to SYN-SENT and, without releasing the socket * lock, select a source port, enter ourselves into the hash tables and * complete initialization after this.
*/ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(tcp_death_row, sk); if (err) goto failure; sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); rt = NULL; if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcp_seq(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port)); WRITE_ONCE(tp->tsoffset, secure_tcp_ts_off(net, inet->inet_saddr, inet->inet_daddr)); } atomic_set(&inet->inet_id, get_random_u16()); if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto failure; err = tcp_connect(sk); if (err) goto failure; return 0; failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; return err; } EXPORT_IPV6_MOD(tcp_v4_connect); /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. */ void tcp_v4_mtu_reduced(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; /* Something is about to go wrong... Remember the soft error * for the case that this connection will not be able to recover. */ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's * clear that the old packet has been * dropped. This is the new "fast" path mtu * discovery. */ tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) dst->ops->redirect(dst, sk, skb); } /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); /* ICMPs are not backlogged, hence we cannot get * an established socket here. */ if (seq != tcp_rsk(req)->snt_isn) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); tcp_listendrop(req->rsk_listener); } reqsk_put(req); } EXPORT_IPV6_MOD(tcp_req_err); /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; s32 remaining; u32 delta_us; if (sock_owned_by_user(sk)) return; if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff) return; skb = tcp_rtx_queue_head(sk); if (WARN_ON_ONCE(!skb)) return; icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ?
__tcp_set_rto(tp) : TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. */ tcp_retransmit_timer(sk); } } EXPORT_IPV6_MOD(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct request_sock *fastopen; struct tcp_sock *tp; u32 seq, snd_una; struct sock *sk; int err; sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || type == ICMP_TIME_EXCEEDED || (type == ICMP_DEST_UNREACH && (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. * We do take care of PMTU discovery (RFC1191) special case : * we can receive locally generated ICMP messages while socket is held. */ if (sock_owned_by_user(sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); } if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. 
*/ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs sent out by Linux are always < 576 bytes, so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; WRITE_ONCE(tp->mtu_info, info); if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } goto out; } err = icmp_err_convert[code].errno; /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and * PORT_UNREACH as hard errors (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note that in the modern internet, where routing is unreliable * and broken firewalls sit in every dark corner sending random * errors ordered by their masters, even these two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs). * * Now we are in compliance with RFCs. * --ANK (980905) */ if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } /* This routine computes an IPv4 TCP checksum. */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) { const struct inet_sock *inet = inet_sk(sk); __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } EXPORT_IPV6_MOD(tcp_v4_send_check); #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, struct ip_reply_arg *arg, struct tcphdr *reply, __be32 reply_options[REPLY_OPTIONS_LEN]) { #ifdef CONFIG_TCP_AO int sdif = tcp_v4_sdif(skb); int dif = inet_iif(skb); int l3index = sdif ?
dif : 0; bool allocated_traffic_key; struct tcp_ao_key *key; char *traffic_key; bool drop = true; u32 ao_sne = 0; u8 keyid; rcu_read_lock(); if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), &key, &traffic_key, &allocated_traffic_key, &keyid, &ao_sne)) goto out; reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | (aoh->rnext_keyid << 8) | keyid); arg->iov[0].iov_len += tcp_ao_len_aligned(key); reply->doff = arg->iov[0].iov_len / 4; if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], key, traffic_key, (union tcp_ao_addr *)&ip_hdr(skb)->saddr, (union tcp_ao_addr *)&ip_hdr(skb)->daddr, reply, ao_sne)) goto out; drop = false; out: rcu_read_unlock(); if (allocated_traffic_key) kfree(traffic_key); return drop; #else return true; #endif } /* * This routine will send an RST to the other tcp. * * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.) * for a reset? * Answer: if a packet caused a RST, it is not for a socket * existing in our system; if it is matched to a socket, * it is just a duplicate segment or a bug in the other side's TCP. * So we build the reply based only on the parameters * that arrived with the segment. * Exception: precedence violation. We do not implement it in any case. */ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[REPLY_OPTIONS_LEN]; } rep; const __u8 *md5_hash_location = NULL; const struct tcp_ao_hdr *aoh; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key = NULL; unsigned char newhash[16]; struct sock *sk1 = NULL; int genhash; #endif u64 transmit_time = 0; struct sock *ctl_sk; struct net *net; u32 txhash = 0; /* Never send a reset in response to a reset. */ if (th->rst) return; /* If sk is not NULL, it means we did a successful lookup and the incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ memset(&rep, 0, sizeof(rep)); rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = sizeof(struct tcphdr) / 4; rep.th.rst = 1; if (th->ack) { rep.th.seq = th->ack_seq; } else { rep.th.ack = 1; rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2)); } memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) return; if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) return; #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); if (sk && sk_fullsock(sk)) { const union tcp_md5_addr *addr; int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (md5_hash_location) { const union tcp_md5_addr *addr; int sdif = tcp_v4_sdif(skb); int dif = inet_iif(skb); int l3index; /* * The active side is lost. Try to find the listening socket through * the source port, and then find the md5 key through the listening socket. * We are not loosening security here: * the incoming packet is checked with the md5 hash of the key we find; * no RST is generated if the md5 hash doesn't match.
*/ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send an rst if it can't find the key */ if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = sdif ? dif : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* Update length and the length the header thinks exists */ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len / 4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ if (rep.opt[0] == 0) { __be32 mrst = mptcp_reset_option(skb); if (mrst) { rep.opt[0] = mrst; arg.iov[0].iov_len += sizeof(mrst); rep.th.doff = arg.iov[0].iov_len / 4; } } arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When the socket is gone, all binding information is lost. * Routing might fail in this case. No choice here: if we choose to force * the input interface, we will misroute in case of an asymmetric route. */ if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); /* ECN bits of TW reset are cleared */ arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); txhash = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_txhash : sk->sk_txhash; } else { ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; } ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, transmit_time, txhash); xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG out: rcu_read_unlock(); #endif } /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states outside socket context, is certainly ugly. What can I do?
*/ static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, int reply_flags, u8 tos, u32 txhash) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; } rep; struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); if (tsecr) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); rep.opt[1] = htonl(tsval); rep.opt[2] = htonl(tsecr); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } /* Swap the send and the receive. */ rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = arg.iov[0].iov_len / 4; rep.th.seq = htonl(seq); rep.th.ack_seq = htonl(ack); rep.th.ack = 1; rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len/4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], key->md5_key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | key->rcv_next); arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); rep.th.doff = arg.iov[0].iov_len / 4; tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ip_hdr(skb)->saddr, (union tcp_ao_addr *)&ip_hdr(skb)->daddr, &rep.th, key->sne); } #endif arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; if (oif) arg.bound_dev_if = oif; arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, transmit_time, txhash); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; u8 tos = tw->tw_tos; /* Clean only the ECN bits of TW ACKs for out-of-window data or * PAWS-rejected segments, while leaving the ECN bits of other TW ACKs * untouched, to avoid those ACKs being placed in different service * queues (Classic rather than L4S) */ if (tw_status == TCP_TW_ACK_OOW) tos &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { inet_twsk_put(tw); return; } if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); key.sne = READ_ONCE(ao_info->snd_sne); rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.type = TCP_KEY_AO; #else if (0) { #endif } else if (static_branch_tcp_md5()) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; } tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, tos, tw->tw_txhash); inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const union tcp_md5_addr *addr; const struct tcp_ao_hdr *aoh; int l3index; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); /* The matching key disappeared (the user removed the key?); * let the handshake time out.
*/ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ip_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif } else if (static_branch_tcp_md5()) { const union tcp_md5_addr *addr; int l3index; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key.md5_key) key.type = TCP_KEY_MD5; } /* Clean the ECN bits of TW ACKs for out-of-window data or PAWS-rejected segments */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } /* * Send a SYN-ACK after having received a SYN. * This still operates on a request_sock only, not on a big * socket. */ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (tos & INET_ECN_MASK); if (!INET_ECN_is_capable(tos) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tos |= INET_ECN_ECT_0; rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), tos); rcu_read_unlock(); err = net_xmit_eval(err); } return err; } /* * IPv4 request_sock destructor. */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of * IP address->MD5 Key. * We need to maintain these in the sk structure. */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); EXPORT_IPV6_MOD(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { if (!old) return true; /* l3index always overrides non-l3index */ if (old->l3index && new->l3index == 0) return false; if (old->l3index == 0 && new->l3index) return true; return old->prefixlen < new->prefixlen; } /* Find the Key structure for an address.
*/ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; const struct tcp_md5sig_info *md5sig; __be32 mask; struct tcp_md5sig_key *best_match = NULL; bool match; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, lockdep_sock_is_held(sk)); if (!md5sig) return NULL; hlist_for_each_entry_rcu(key, &md5sig->head, node, lockdep_sock_is_held(sk)) { if (key->family != family) continue; if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == (addr->a4.s_addr & mask); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, key->prefixlen); #endif } else { match = false; } if (match && better_md5_match(best_match, key)) best_match = key; } return best_match; } EXPORT_IPV6_MOD(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); const struct tcp_md5sig_info *md5sig; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, lockdep_sock_is_held(sk)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) size = sizeof(struct in6_addr); #endif hlist_for_each_entry_rcu(key, &md5sig->head, node, lockdep_sock_is_held(sk)) { if (key->family != family) continue; if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) continue; if (key->l3index != l3index) continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; } return NULL; } struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } EXPORT_IPV6_MOD(tcp_v4_md5_lookup); static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) return -ENOMEM; sk_gso_disable(sk); INIT_HLIST_HEAD(&md5sig->head); rcu_assign_pointer(tp->md5sig_info, md5sig); return 0; } /* This can be called on a newly created socket, from other files */ static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (key) { /* Pre-existing entry - just update that one. * Note that the key might be used concurrently. * data_race() is telling kcsan that we do not care of * key mismatches, since changing MD5 key on live flows * can lead to packet drops. */ data_race(memcpy(key->key, newkey, newkeylen)); /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 
* Also note that a reader could catch new key->keylen value * but old key->key[], this is the reason we use __GFP_ZERO * at sock_kmalloc() time below these lines. */ WRITE_ONCE(key->keylen, newkeylen); return 0; } md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; memcpy(key->key, newkey, newkeylen); key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; key->l3index = l3index; key->flags = flags; memcpy(&key->addr, addr, (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); hlist_add_head_rcu(&key->node, &md5sig->head); return 0; } int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen) { struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { if (tcp_md5_alloc_sigpool()) return -ENOMEM; if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { tcp_md5_release_sigpool(); return -ENOMEM; } if (!static_branch_inc(&tcp_md5_needed.key)) { struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); tcp_md5_release_sigpool(); return -EUSERS; } } return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } EXPORT_IPV6_MOD(tcp_md5_do_add); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key) { struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { tcp_md5_add_sigpool(); if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { tcp_md5_release_sigpool(); return -ENOMEM; } if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); tcp_md5_release_sigpool(); return -EUSERS; } } return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } EXPORT_IPV6_MOD(tcp_md5_key_copy); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { struct tcp_md5sig_key *key; key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (!key) return -ENOENT; hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); return 0; } EXPORT_IPV6_MOD(tcp_md5_do_del); void tcp_clear_md5_list(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; struct hlist_node *n; struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree(key); } } static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; const union tcp_md5_addr *addr; u8 prefixlen = 32; int l3index = 0; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if 
(sin->sin_family != AF_INET) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 32) return -EINVAL; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, __be32 daddr, __be32 saddr, const struct tcphdr *th, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sigpool hp; __be32 saddr, daddr; if (sk) { /* valid for establish/request sockets */ saddr = sk->sk_rcv_saddr; daddr = sk->sk_daddr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); #endif static void tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); struct 
net *net = sock_net(sk_listener); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v4_init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet_csk_route_req(sk, &fl->u.ip4, req); } struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v4_ao_lookup_rsk, .ao_calc_key = tcp_v4_ao_calc_key_rsk, .ao_synack_hash = tcp_v4_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_seq, .init_ts_off = tcp_v4_init_ts_off, .send_synack = tcp_v4_send_synack, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&tcp_request_sock_ops, &tcp_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } EXPORT_IPV6_MOD(tcp_v4_conn_request); /* * The three way handshake has completed - we got a valid synack - * now create the new socket. */ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; int l3index; #endif struct ip_options_rcu *inet_opt; if (sk_acceptq_is_full(sk)) goto exit_overflow; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; atomic_set(&newinet->inet_id, get_random_u16()); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). 
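 * Worked example (illustrative): with tcp_reflect_tos enabled and a SYN
 * that carried tos 0x2e, the child gets 0x2e & ~INET_ECN_MASK == 0x2c,
 * i.e. the DSCP bits are reflected while the two ECN bits stay clear
 * until tcp_init_transfer() sets them.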
*/ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) goto put_and_exit; } else { /* syncookie case : see end of cookie_v4_check() */ } sk_setup_caps(newsk, dst); tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ addr = (union tcp_md5_addr *)&newinet->inet_daddr; key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key && !tcp_rsk_used_ao(req)) { if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) goto put_and_exit; sk_gso_disable(newsk); } #endif #ifdef CONFIG_TCP_AO if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) goto put_and_exit; /* OOM, release back memory */ #endif if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { newinet->inet_opt = NULL; if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; exit_overflow: NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: tcp_listendrop(sk); return NULL; put_and_exit: newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; } EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v4_check(sk, skb); #endif return sk; } u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, &tcp_request_sock_ipv4_ops, sk, th); if (mss) { *cookie = __cookie_v4_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. 
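 * Illustrative sketch of the caller's pattern (as used by tcp_v4_rcv()
 * further down; assumes process context may own the socket):
 *
 *	bh_lock_sock_nested(sk);
 *	tcp_segs_in(tcp_sk(sk), skb);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb, &drop_reason))
 *		goto discard_and_relse;
 *	bh_unlock_sock(sk);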
*/ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; struct sock *rsk; reason = psp_sk_rx_policy_check(sk, skb); if (reason) goto err_discard; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, dst, 0)) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v4_cookie_check(sk, skb); if (!nsk) return 0; if (nsk != sk) { reason = tcp_child_process(sk, nsk, skb); if (reason) { rsk = nsk; goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) { rsk = sk; goto reset; } return 0; reset: tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, * but you have been warned. */ return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return 0; sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; struct sk_buff *tail; unsigned int hdrlen; bool fragstolen; u32 gso_segs; u32 gso_size; u64 limit; int delta; int err; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. * This is valid because skb is not yet charged to the socket. * It has been noticed pure SACK packets were sometimes dropped * (if cooked by drivers without copybreak feature). */ skb_condense(skb); tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); trace_tcp_bad_csum(skb); *reason = SKB_DROP_REASON_TCP_CSUM; __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; } /* Attempt coalescing to last skb in backlog, even if we are * above the limits. * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
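 * Illustrative example: two in-sequence data segments with ACK set,
 * seq 1000..1100 and 1100..1200, with identical DSCP and header
 * options, pass the checks below and merge into one 200-byte skb with
 * gso_segs == 2; a segment carrying SYN/RST/URG, or a different DSCP,
 * takes the no_coalesce path instead.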
*/ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; tail = sk->sk_backlog.tail; if (!tail) goto no_coalesce; thtail = (struct tcphdr *)tail->data; if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || ((TCP_SKB_CB(tail)->tcp_flags | TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || !((TCP_SKB_CB(tail)->tcp_flags & TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || /* prior to PSP Rx policy check, retain exact PSP metadata */ psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); shinfo = skb_shinfo(skb); gso_size = shinfo->gso_size ?: skb->len; gso_segs = shinfo->gso_segs ?: 1; shinfo = skb_shinfo(tail); tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); tail_gso_segs = shinfo->gso_segs ?: 1; if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; thtail->window = th->window; } /* We have to update both TCP_SKB_CB(tail)->tcp_flags and * thtail->fin, so that the fast path in tcp_rcv_established() * is not entered if we append a packet with a FIN. * SYN, RST, URG are not present. * ACK is set on both packets. * PSH : we do not really care in TCP stack, * at least for 'GRO' packets. */ thtail->fin |= th->fin; TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; if (TCP_SKB_CB(skb)->has_rxtstamp) { TCP_SKB_CB(tail)->has_rxtstamp = true; tail->tstamp = skb->tstamp; skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; } /* Not as strict as GRO. We only need to carry mss max value */ shinfo->gso_size = max(gso_size, tail_gso_size); shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGCOALESCE); kfree_skb_partial(skb, fragstolen); return false; } __skb_push(skb, hdrlen); no_coalesce: /* sk->sk_backlog.len is reset only at the end of __release_sock(). * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach * sk_rcvbuf in normal conditions. */ limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. 
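 * Illustrative arithmetic (example values): sk_rcvbuf == 131072 and
 * sk_sndbuf == 16384 give 2 * 131072 + 16384 / 2 + 65536 == 335872
 * bytes of allowed backlog.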
*/ limit += 64 * 1024; limit = min_t(u64, limit, UINT_MAX); err = sk_add_backlog(sk, skb, limit); if (unlikely(err)) { bh_unlock_sock(sk); if (err == -ENOMEM) { *reason = SKB_DROP_REASON_PFMEMALLOC; __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); } else { *reason = SKB_DROP_REASON_SOCKET_BACKLOG; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); } return true; } return false; } EXPORT_IPV6_MOD(tcp_add_backlog); int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { struct tcphdr *th = (struct tcphdr *)skb->data; return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); } EXPORT_IPV6_MOD(tcp_filter); static void tcp_v4_restore_cb(struct sk_buff *skb) { memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, sizeof(struct inet_skb_parm)); } static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, const struct tcphdr *th) { /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() * barrier() makes sure compiler wont play fool^Waliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), sizeof(struct inet_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } /* * From tcp_input.c */ int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* Count it even if it's bad */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. 
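 * The code just below therefore sets refcounted = true without taking
 * another reference on this path, while the listener branch that
 * follows has to sock_hold() explicitly.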
*/ } else { /* We own a reference on the listener, increase it again * as we might lose it too soon. */ sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v4_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v4_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v4_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v4_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v4_fill_cb(skb, iph, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. 
*/ sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v4_fill_cb(skb, iph, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); if (drop_reason) break; } /* to ACK */ fallthrough; case TCP_TW_ACK: case TCP_TW_ACK_OOW: tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; } goto discard_it; } static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; } } EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_IPV6_MOD(ipv4_specific); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v4_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v4_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v4_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; static void tcp4_destruct_sock(struct sock *sk) { tcp_md5_destruct_sock(sk); tcp_ao_destroy_sock(sk, false); inet_sock_destruct(sk); } #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv4_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL unsigned long index; void *netmem; xa_for_each(&sk->sk_user_frags, index, netmem) WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); #endif } void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tcp_release_user_frags(sk); xa_destroy(&sk->sk_user_frags); trace_tcp_destroy_sock(sk); tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); tcp_cleanup_ulp(sk); /* Cleanup up the write buffer. 
*/ tcp_write_queue_purge(sk); /* Check if we want to disable active TFO */ tcp_fastopen_active_disable_ofo_check(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); } EXPORT_IPV6_MOD(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ static unsigned short seq_file_family(const struct seq_file *seq); static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { unsigned short family = seq_file_family(seq); /* AF_UNSPEC is used as a match all */ return ((family == AF_UNSPEC || family == sk->sk_family) && net_eq(sock_net(sk), seq_file_net(seq))); } /* Find a non empty bucket (starting from st->bucket) * and return the first sk from it. */ static void *listening_get_first(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; struct sock *sk; ilb2 = &hinfo->lhash2[st->bucket]; if (hlist_nulls_empty(&ilb2->nulls_head)) continue; spin_lock(&ilb2->lock); sk_nulls_for_each(sk, node, &ilb2->nulls_head) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock(&ilb2->lock); } return NULL; } /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). * If "cur" is the last one in the st->bucket, * call listening_get_first() to return the first sk of the next * non empty bucket. */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; struct inet_hashinfo *hinfo; struct sock *sk = cur; ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; ilb2 = &hinfo->lhash2[st->bucket]; spin_unlock(&ilb2->lock); ++st->bucket; return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; st->offset = 0; rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); --*pos; } return rc; } static inline bool empty_bucket(struct inet_hashinfo *hinfo, const struct tcp_iter_state *st) { return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); } /* * Get first established socket starting from bucket given in st->bucket. * If st->bucket is zero, the very first socket in the hash is returned. 
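 * Illustrative note: these iterators back /proc/net/tcp, where the "sl"
 * column is st->num; a made-up line such as
 *	0: 0100007F:0016 00000000:0000 0A ...
 * would describe a socket in state 0x0A (TCP_LISTEN) bound to
 * 127.0.0.1:22 (the address prints as a hex dword, so 0100007F is
 * 127.0.0.1 on little-endian hosts).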
*/ static void *established_get_first(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); cond_resched(); /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; spin_lock_bh(lock); sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock_bh(lock); } return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; struct hlist_nulls_node *node; struct sock *sk = cur; ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); ++st->bucket; return established_get_first(seq); } static void *established_get_idx(struct seq_file *seq, loff_t pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); --pos; } return rc; } static void *tcp_get_idx(struct seq_file *seq, loff_t pos) { void *rc; struct tcp_iter_state *st = seq->private; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); } return rc; } static void *tcp_seek_last_pos(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (st->bucket > hinfo->lhash2_mask) break; rc = listening_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; st->bucket = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: if (st->bucket > hinfo->ehash_mask) break; rc = established_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } st->num = orig_num; return rc; } void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; if (*pos && *pos == st->last_pos) { rc = tcp_seek_last_pos(seq); if (rc) goto out; } st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; st->bucket = 0; st->offset = 0; rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; out: st->last_pos = *pos; return rc; } EXPORT_IPV6_MOD(tcp_seq_start); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc = NULL; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } switch (st->state) { case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; st->offset = 0; rc = established_get_first(seq); } break; case TCP_SEQ_STATE_ESTABLISHED: rc = established_get_next(seq, v); break; } out: ++*pos; st->last_pos = *pos; return rc; } EXPORT_IPV6_MOD(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock(&hinfo->lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); break; } } EXPORT_IPV6_MOD(tcp_seq_stop); static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, ireq->ir_num, ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, from_kuid_munged(seq_user_ns(f), sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) { int timer_active; unsigned long timer_expires; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); u8 icsk_pending; int rx_queue; int state; icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; timer_expires = sk->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sk); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. 
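 * Illustrative scenario: a racing receiver can advance copied_seq past
 * the rcv_nxt value sampled here, making the difference negative; the
 * max_t(int, ..., 0) below clamps the reported rx_queue to zero.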
*/ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; dest = tw->tw_daddr; src = tw->tw_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } #define TMPSZ 150 static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; seq_setwidth(seq, TMPSZ - 1); if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait4_sock(v, seq, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq4(v, seq, st->num); else get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL union bpf_tcp_iter_batch_item { struct sock *sk; __u64 cookie; }; struct bpf_tcp_iter_state { struct tcp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; union bpf_tcp_iter_batch_item *batch; }; struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); uid_t uid __aligned(8); }; static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) { struct bpf_iter__tcp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk_common = sk_common; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) { union bpf_tcp_iter_batch_item *item; unsigned int cur_sk = iter->cur_sk; __u64 cookie; /* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around. 
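 * Illustrative note: sock_gen_cookie() returns a stable 64-bit id for
 * each socket, so when the bucket is walked again later,
 * bpf_iter_tcp_resume_bucket() can re-find the first remembered socket
 * by comparing sk->sk_cookie even if neighbouring entries changed.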
*/ while (cur_sk < iter->end_sk) { item = &iter->batch[cur_sk++]; cookie = sock_gen_cookie(item->sk); sock_gen_put(item->sk); item->cookie = cookie; } } static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags) { union bpf_tcp_iter_batch_item *new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, union bpf_tcp_iter_batch_item *cookies, int n_cookies) { struct hlist_nulls_node *node; struct sock *sk; int i; for (i = 0; i < n_cookies; i++) { sk = first_sk; sk_nulls_for_each_from(sk, node) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) return sk; } return NULL; } static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; unsigned int find_cookie = iter->cur_sk; unsigned int end_cookie = iter->end_sk; int resume_bucket = st->bucket; struct sock *sk; if (end_cookie && find_cookie == end_cookie) ++st->bucket; sk = listening_get_first(seq); iter->cur_sk = 0; iter->end_sk = 0; if (sk && st->bucket == resume_bucket && end_cookie) { sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], end_cookie - find_cookie); if (!sk) { spin_unlock(&hinfo->lhash2[st->bucket].lock); ++st->bucket; sk = listening_get_first(seq); } } return sk; } static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; unsigned int find_cookie = iter->cur_sk; unsigned int end_cookie = iter->end_sk; int resume_bucket = st->bucket; struct sock *sk; if (end_cookie && find_cookie == end_cookie) ++st->bucket; sk = established_get_first(seq); iter->cur_sk = 0; iter->end_sk = 0; if (sk && st->bucket == resume_bucket && end_cookie) { sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], end_cookie - find_cookie); if (!sk) { spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); ++st->bucket; sk = established_get_first(seq); } } return sk; } static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct sock *sk = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: sk = bpf_iter_tcp_resume_listening(seq); if (sk) break; st->bucket = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: sk = bpf_iter_tcp_resume_established(seq); break; } return sk; } static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(*start_sk); iter->batch[iter->end_sk++].sk = *start_sk; sk = sk_nulls_next(*start_sk); *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } else if (!*start_sk) { /* Remember where we left off. 
*/ *start_sk = sk; } expected++; } } return expected; } static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(*start_sk); iter->batch[iter->end_sk++].sk = *start_sk; sk = sk_nulls_next(*start_sk); *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } else if (!*start_sk) { /* Remember where we left off. */ *start_sk = sk; } expected++; } } return expected; } static unsigned int bpf_iter_fill_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; if (st->state == TCP_SEQ_STATE_LISTENING) return bpf_iter_tcp_listening_batch(seq, start_sk); else return bpf_iter_tcp_established_batch(seq, start_sk); } static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; if (st->state == TCP_SEQ_STATE_LISTENING) spin_unlock(&hinfo->lhash2[st->bucket].lock); else spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); } static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) { struct bpf_tcp_iter_state *iter = seq->private; unsigned int expected; struct sock *sk; int err; sk = bpf_iter_tcp_resume(seq); if (!sk) return NULL; /* Done */ expected = bpf_iter_fill_batch(seq, &sk); if (likely(iter->end_sk == expected)) goto done; /* Batch size was too small. */ bpf_iter_tcp_unlock_bucket(seq); bpf_iter_tcp_put_batch(iter); err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, GFP_USER); if (err) return ERR_PTR(err); sk = bpf_iter_tcp_resume(seq); if (!sk) return NULL; /* Done */ expected = bpf_iter_fill_batch(seq, &sk); if (likely(iter->end_sk == expected)) goto done; /* Batch size was still too small. Hold onto the lock while we try * again with a larger batch to make sure the current bucket's size * does not change in the meantime. */ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); if (err) { bpf_iter_tcp_unlock_bucket(seq); return ERR_PTR(err); } expected = bpf_iter_fill_batch(seq, &sk); WARN_ON_ONCE(iter->end_sk != expected); done: bpf_iter_tcp_unlock_bucket(seq); return iter->batch[0].sk; } static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ if (*pos) return bpf_iter_tcp_batch(seq); return SEQ_START_TOKEN; } static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. */ if (iter->cur_sk < iter->end_sk) { /* Keeping st->num consistent in tcp_iter_state. * bpf_iter_tcp does not use st->num. * meta.seq_num is used instead. */ st->num++; sock_gen_put(iter->batch[iter->cur_sk++].sk); } if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk].sk; else sk = bpf_iter_tcp_batch(seq); ++*pos; /* Keeping st->last_pos consistent in tcp_iter_state. * bpf iter does not do lseek, so st->last_pos always equals to *pos. 
*/ st->last_pos = *pos; return sk; } static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; if (sk_fullsock(sk)) lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { const struct request_sock *req = v; uid = from_kuid_munged(seq_user_ns(seq), sk_uid(req->rsk_listener)); } else { uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); } meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = tcp_prog_seq_show(prog, &meta, v, uid); unlock: if (sk_fullsock(sk)) release_sock(sk); return ret; } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)tcp_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_tcp_put_batch(iter); } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, .start = bpf_iter_tcp_seq_start, .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct tcp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ if (seq->op == &bpf_iter_tcp_seq_ops) return AF_UNSPEC; #endif /* Iterated from proc fs */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp4_seq_afinfo = { .family = AF_INET, }; static int __net_init tcp4_proc_init_net(struct net *net) { if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit tcp4_proc_exit_net(struct net *net) { remove_proc_entry("tcp", net->proc_net); } static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, }; int __init tcp4_proc_init(void) { return register_pernet_subsys(&tcp4_net_ops); } void tcp4_proc_exit(void) { unregister_pernet_subsys(&tcp4_net_ops); } #endif /* CONFIG_PROC_FS */ /* @wake is one when sk_stream_write_space() calls us. * This sends EPOLLOUT only if notsent_bytes is half the limit. * This mimics the strategy used in sock_def_write_space(). 
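 * Worked example (illustrative): with TCP_NOTSENT_LOWAT at 131072 and
 * 102400 not-yet-sent bytes, a plain poll (wake == 0) sees the socket
 * writable (102400 < 131072), while the sk_stream_write_space() path
 * (wake == 1) evaluates 204800 < 131072 and stays silent.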
*/ bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } EXPORT_SYMBOL(tcp_stream_memory_free); struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); static void __net_exit tcp_sk_exit(struct net *net) { if (net->ipv4.tcp_congestion_control) bpf_module_put(net->ipv4.tcp_congestion_control, net->ipv4.tcp_congestion_control->owner); } static void __net_init tcp_set_hashinfo(struct net *net) { struct inet_hashinfo *hinfo; unsigned int ehash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); if (!ehash_entries) goto fallback; ehash_entries = roundup_pow_of_two(ehash_entries); hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); if (!hinfo) { pr_warn("Failed to allocate TCP ehash (entries: %u) " "for a netns, fallback to the global one\n", ehash_entries); fallback: hinfo = &tcp_hashinfo; ehash_entries = tcp_hashinfo.ehash_mask + 1; } net->ipv4.tcp_death_row.hashinfo = hinfo; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); } static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 
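	/*
	 * Illustrative note: each assignment in this function seeds a
	 * per-netns sysctl; the keepalive trio above, for example, shows
	 * up as /proc/sys/net/ipv4/tcp_keepalive_time,
	 * tcp_keepalive_probes and tcp_keepalive_intvl in that namespace.
	 */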
net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; net->ipv4.sysctl_tcp_syncookies = 1; net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; net->ipv4.sysctl_tcp_orphan_retries = 0; net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); tcp_set_hashinfo(net); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of 4 MB */ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, sizeof(init_net.ipv4.sysctl_tcp_rmem)); memcpy(net->ipv4.sysctl_tcp_wmem, init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_backlog_ack_defer = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); /* Set default values for PLB */ net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; /* Default congestion threshold for PLB to mark a round is 50% */ net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } static void __net_exit tcp_sk_exit_batch(struct list_head 
*net_exit_list) { struct net *net; /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work * and failed setup_net error unwinding path are serialized. * * tcp_twsk_purge() handles twsk in any dead netns, not just those in * net_exit_list, the thread that dismantles a particular twsk must * do so without other thread progressing to refcount_dec_and_test() of * tcp_death_row.tw_refcount. */ mutex_lock(&tcp_exit_batch_mutex); tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } mutex_unlock(&tcp_exit_batch_mutex); } static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_tcp_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_tcp(void *priv_data) { struct bpf_tcp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; static const struct bpf_func_proto * bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; static void __init bpf_iter_register(void) { tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; if (bpf_iter_reg_target(&tcp_reg_info)) pr_warn("Warning: could not register bpf iterator tcp\n"); } #endif void __init tcp_v4_init(void) { int cpu, res; for_each_possible_cpu(cpu) { struct sock *sk; res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, IPPROTO_TCP, &init_net); if (res) panic("Failed to create the TCP control socket.\n"); sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); /* Please enforce IP_DF and IPID==0 for RST and * ACK sent in SYN-RECV and TIME-WAIT state. */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; sk->sk_clockid = CLOCK_MONOTONIC; per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif }
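/*
 * Editor's note: the sketch below is not part of the kernel source above.
 * It is a minimal userspace illustration of the knob that
 * tcp_stream_memory_free() consults: tcp_notsent_lowat(tp) defaults to
 * UINT_MAX per netns (see tcp_sk_init() above) and is lowered per socket
 * with the standard TCP_NOTSENT_LOWAT socket option. Error handling is
 * abbreviated.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <sys/socket.h>

int limit_unsent(int fd)
{
	int lowat = 128 * 1024;	/* report writable once < 128 KiB is unsent */

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}

int wait_writable(int fd)
{
	/* POLLOUT fires only while tcp_stream_memory_free() is true,
	 * i.e. (write_seq - snd_nxt) << wake is below the lowat value. */
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };

	return poll(&pfd, 1, -1);
}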
// SPDX-License-Identifier: GPL-2.0 /* dvb-usb-remote.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file contains functions for initializing the input-device and for handling remote-control-queries. */ #include "dvb-usb-common.h" #include <linux/usb/input.h> static unsigned int legacy_dvb_usb_get_keymap_index(const struct input_keymap_entry *ke, struct rc_map_table *keymap, unsigned int keymap_size) { unsigned int index; unsigned int scancode; if (ke->flags & INPUT_KEYMAP_BY_INDEX) { index = ke->index; } else { if (input_scancode_to_scalar(ke, &scancode)) return keymap_size; /* See if we can match the raw key code.
*/ for (index = 0; index < keymap_size; index++) if (keymap[index].scancode == scancode) break; /* See if there is an unused hole in the map */ if (index >= keymap_size) { for (index = 0; index < keymap_size; index++) { if (keymap[index].keycode == KEY_RESERVED || keymap[index].keycode == KEY_UNKNOWN) { break; } } } } return index; } static int legacy_dvb_usb_getkeycode(struct input_dev *dev, struct input_keymap_entry *ke) { struct dvb_usb_device *d = input_get_drvdata(dev); struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; unsigned int keymap_size = d->props.rc.legacy.rc_map_size; unsigned int index; index = legacy_dvb_usb_get_keymap_index(ke, keymap, keymap_size); if (index >= keymap_size) return -EINVAL; ke->keycode = keymap[index].keycode; if (ke->keycode == KEY_UNKNOWN) ke->keycode = KEY_RESERVED; ke->len = sizeof(keymap[index].scancode); memcpy(&ke->scancode, &keymap[index].scancode, ke->len); ke->index = index; return 0; } static int legacy_dvb_usb_setkeycode(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode) { struct dvb_usb_device *d = input_get_drvdata(dev); struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; unsigned int keymap_size = d->props.rc.legacy.rc_map_size; unsigned int index; index = legacy_dvb_usb_get_keymap_index(ke, keymap, keymap_size); /* * FIXME: Currently, it is not possible to increase the size of * scancode table. For it to happen, one possibility * would be to allocate a table with key_map_size + 1, * copying data, appending the new key on it, and freeing * the old one - or maybe just allocating some spare space */ if (index >= keymap_size) return -EINVAL; *old_keycode = keymap[index].keycode; keymap[index].keycode = ke->keycode; __set_bit(ke->keycode, dev->keybit); if (*old_keycode != KEY_RESERVED) { __clear_bit(*old_keycode, dev->keybit); for (index = 0; index < keymap_size; index++) { if (keymap[index].keycode == *old_keycode) { __set_bit(*old_keycode, dev->keybit); break; } } } return 0; } /* Remote-control poll function - called every dib->rc_query_interval ms to see * whether the remote control has received anything. * * TODO: Fix the repeat rate of the input device. */ static void legacy_dvb_usb_read_remote_control(struct work_struct *work) { struct dvb_usb_device *d = container_of(work, struct dvb_usb_device, rc_query_work.work); u32 event; int state; /* TODO: need a lock here. We can simply skip checking for the remote control if we're busy. */ /* when the parameter has been set to 1 via sysfs while the driver was running */ if (dvb_usb_disable_rc_polling) return; if (d->props.rc.legacy.rc_query(d, &event, &state)) { err("error while querying for a remote control event."); goto schedule; } switch (state) { case REMOTE_NO_KEY_PRESSED: break; case REMOTE_KEY_PRESSED: deb_rc("key pressed\n"); d->last_event = event; input_event(d->input_dev, EV_KEY, event, 1); input_sync(d->input_dev); input_event(d->input_dev, EV_KEY, d->last_event, 0); input_sync(d->input_dev); break; case REMOTE_KEY_REPEAT: deb_rc("key repeated\n"); input_event(d->input_dev, EV_KEY, event, 1); input_sync(d->input_dev); input_event(d->input_dev, EV_KEY, d->last_event, 0); input_sync(d->input_dev); break; default: break; } /* improved repeat handling ???
switch (state) { case REMOTE_NO_KEY_PRESSED: deb_rc("NO KEY PRESSED\n"); if (d->last_state != REMOTE_NO_KEY_PRESSED) { deb_rc("releasing event %d\n",d->last_event); input_event(d->rc_input_dev, EV_KEY, d->last_event, 0); input_sync(d->rc_input_dev); } d->last_state = REMOTE_NO_KEY_PRESSED; d->last_event = 0; break; case REMOTE_KEY_PRESSED: deb_rc("KEY PRESSED\n"); deb_rc("pressing event %d\n",event); input_event(d->rc_input_dev, EV_KEY, event, 1); input_sync(d->rc_input_dev); d->last_event = event; d->last_state = REMOTE_KEY_PRESSED; break; case REMOTE_KEY_REPEAT: deb_rc("KEY_REPEAT\n"); if (d->last_state != REMOTE_NO_KEY_PRESSED) { deb_rc("repeating event %d\n",d->last_event); input_event(d->rc_input_dev, EV_KEY, d->last_event, 2); input_sync(d->rc_input_dev); d->last_state = REMOTE_KEY_REPEAT; } default: break; } */ schedule: schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->props.rc.legacy.rc_interval)); } static int legacy_dvb_usb_remote_init(struct dvb_usb_device *d) { int i, err, rc_interval; struct input_dev *input_dev; input_dev = input_allocate_device(); if (!input_dev) return -ENOMEM; input_dev->evbit[0] = BIT_MASK(EV_KEY); input_dev->name = "IR-receiver inside a USB DVB receiver"; input_dev->phys = d->rc_phys; usb_to_input_id(d->udev, &input_dev->id); input_dev->dev.parent = &d->udev->dev; d->input_dev = input_dev; d->rc_dev = NULL; input_dev->getkeycode = legacy_dvb_usb_getkeycode; input_dev->setkeycode = legacy_dvb_usb_setkeycode; /* set the bits for the keys */ deb_rc("key map size: %d\n", d->props.rc.legacy.rc_map_size); for (i = 0; i < d->props.rc.legacy.rc_map_size; i++) { deb_rc("setting bit for event %d item %d\n", d->props.rc.legacy.rc_map_table[i].keycode, i); set_bit(d->props.rc.legacy.rc_map_table[i].keycode, input_dev->keybit); } /* since these two values are set to non-zero, we have to manage key repeats ourselves */ input_dev->rep[REP_PERIOD] = d->props.rc.legacy.rc_interval; input_dev->rep[REP_DELAY] = d->props.rc.legacy.rc_interval + 150; input_set_drvdata(input_dev, d); err = input_register_device(input_dev); if (err) { input_free_device(input_dev); return err; } rc_interval = d->props.rc.legacy.rc_interval; INIT_DELAYED_WORK(&d->rc_query_work, legacy_dvb_usb_read_remote_control); info("scheduling remote query every %d msecs.", rc_interval); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(rc_interval)); d->state |= DVB_USB_STATE_REMOTE; return 0; } /* Remote-control poll function - called every dib->rc_query_interval ms to see * whether the remote control has received anything. * * TODO: Fix the repeat rate of the input device. */ static void dvb_usb_read_remote_control(struct work_struct *work) { struct dvb_usb_device *d = container_of(work, struct dvb_usb_device, rc_query_work.work); int err; /* TODO: need a lock here. We can simply skip checking for the remote control if we're busy.
*/ /* when the parameter has been set to 1 via sysfs while the * driver was running, or when bulk mode is enabled after IR init */ if (dvb_usb_disable_rc_polling || d->props.rc.core.bulk_mode) return; err = d->props.rc.core.rc_query(d); if (err) err("error %d while querying for a remote control event.", err); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->props.rc.core.rc_interval)); } static int rc_core_dvb_usb_remote_init(struct dvb_usb_device *d) { int err, rc_interval; struct rc_dev *dev; dev = rc_allocate_device(d->props.rc.core.driver_type); if (!dev) return -ENOMEM; dev->driver_name = d->props.rc.core.module_name; dev->map_name = d->props.rc.core.rc_codes; dev->change_protocol = d->props.rc.core.change_protocol; dev->allowed_protocols = d->props.rc.core.allowed_protos; usb_to_input_id(d->udev, &dev->input_id); dev->device_name = d->desc->name; dev->input_phys = d->rc_phys; dev->dev.parent = &d->udev->dev; dev->priv = d; dev->scancode_mask = d->props.rc.core.scancode_mask; err = rc_register_device(dev); if (err < 0) { rc_free_device(dev); return err; } d->input_dev = NULL; d->rc_dev = dev; if (!d->props.rc.core.rc_query || d->props.rc.core.bulk_mode) return 0; /* Polling mode - initialize a work queue for handling it */ INIT_DELAYED_WORK(&d->rc_query_work, dvb_usb_read_remote_control); rc_interval = d->props.rc.core.rc_interval; info("scheduling remote query every %d msecs.", rc_interval); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(rc_interval)); return 0; } int dvb_usb_remote_init(struct dvb_usb_device *d) { int err; if (dvb_usb_disable_rc_polling) return 0; if (d->props.rc.legacy.rc_map_table && d->props.rc.legacy.rc_query) d->props.rc.mode = DVB_RC_LEGACY; else if (d->props.rc.core.rc_codes) d->props.rc.mode = DVB_RC_CORE; else return 0; usb_make_path(d->udev, d->rc_phys, sizeof(d->rc_phys)); strlcat(d->rc_phys, "/ir0", sizeof(d->rc_phys)); /* Start the remote-control polling. */ if (d->props.rc.legacy.rc_interval < 40) d->props.rc.legacy.rc_interval = 100; /* default */ if (d->props.rc.mode == DVB_RC_LEGACY) err = legacy_dvb_usb_remote_init(d); else err = rc_core_dvb_usb_remote_init(d); if (err) return err; d->state |= DVB_USB_STATE_REMOTE; return 0; } int dvb_usb_remote_exit(struct dvb_usb_device *d) { if (d->state & DVB_USB_STATE_REMOTE) { cancel_delayed_work_sync(&d->rc_query_work); if (d->props.rc.mode == DVB_RC_LEGACY) input_unregister_device(d->input_dev); else rc_unregister_device(d->rc_dev); } d->state &= ~DVB_USB_STATE_REMOTE; return 0; } #define DVB_USB_RC_NEC_EMPTY 0x00 #define DVB_USB_RC_NEC_KEY_PRESSED 0x01 #define DVB_USB_RC_NEC_KEY_REPEATED 0x02 int dvb_usb_nec_rc_key_to_event(struct dvb_usb_device *d, u8 keybuf[5], u32 *event, int *state) { int i; struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; *event = 0; *state = REMOTE_NO_KEY_PRESSED; switch (keybuf[0]) { case DVB_USB_RC_NEC_EMPTY: break; case DVB_USB_RC_NEC_KEY_PRESSED: if ((u8) ~keybuf[1] != keybuf[2] || (u8) ~keybuf[3] != keybuf[4]) { deb_err("remote control checksum failed.\n"); break; } /* See if we can match the raw key code.
*/ for (i = 0; i < d->props.rc.legacy.rc_map_size; i++) if (rc5_custom(&keymap[i]) == keybuf[1] && rc5_data(&keymap[i]) == keybuf[3]) { *event = keymap[i].keycode; *state = REMOTE_KEY_PRESSED; return 0; } deb_err("key mapping failed - no appropriate key found in keymapping\n"); break; case DVB_USB_RC_NEC_KEY_REPEATED: *state = REMOTE_KEY_REPEAT; break; default: deb_err("unknown type of remote status: %d\n",keybuf[0]); break; } return 0; } EXPORT_SYMBOL(dvb_usb_nec_rc_key_to_event);
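/*
 * Editor's note: a hedged sketch (not from the original file) of the
 * legacy rc_query() callback whose signature the polling loop above
 * expects. The 5-byte buffer layout matches dvb_usb_nec_rc_key_to_event()
 * as defined above; my_read_remote_buf() is a hypothetical stand-in for a
 * device-specific USB transfer.
 */
static int my_read_remote_buf(struct dvb_usb_device *d, u8 *buf, int len); /* hypothetical I/O helper */

static int my_rc_query(struct dvb_usb_device *d, u32 *event, int *state)
{
	u8 key[5];

	if (my_read_remote_buf(d, key, sizeof(key)))
		return -EIO;

	/* key[0] = state byte, key[1..4] = checksummed NEC address/command
	 * pair; the exported helper maps it onto the legacy keymap. */
	return dvb_usb_nec_rc_key_to_event(d, key, event, state);
}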
// SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (line6@grabner-graz.at) */ #include <linux/slab.h> #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/control.h> #include "capture.h" #include "driver.h" #include "playback.h" /* Locate name in binary program dump */ #define POD_NAME_OFFSET 0 #define POD_NAME_LENGTH 16 /* Other constants */ #define POD_CONTROL_SIZE 0x80 #define POD_BUFSIZE_DUMPREQ 7 #define POD_STARTUP_DELAY 1000 /* Stages of POD startup procedure */ enum { POD_STARTUP_VERSIONREQ, POD_STARTUP_SETUP, POD_STARTUP_DONE, }; enum { LINE6_BASSPODXT, LINE6_BASSPODXTLIVE, LINE6_BASSPODXTPRO, LINE6_POCKETPOD, LINE6_PODXT, LINE6_PODXTLIVE_POD, LINE6_PODXTPRO, }; struct usb_line6_pod { /* Generic Line 6 USB data */ struct usb_line6 line6; /* Instrument monitor level */ int monitor_level; /* Current progress in startup procedure */ int startup_progress; /* Serial number of device */ u32 serial_number; /* Firmware version (x 100) */ int firmware_version; /* Device ID */ int device_id; }; #define line6_to_pod(x) container_of(x, struct usb_line6_pod, line6) #define POD_SYSEX_CODE 3 /* *INDENT-OFF* */ enum { POD_SYSEX_SAVE = 0x24, POD_SYSEX_SYSTEM = 0x56, POD_SYSEX_SYSTEMREQ = 0x57, /* POD_SYSEX_UPDATE = 0x6c, */ /* software update!
*/ POD_SYSEX_STORE = 0x71, POD_SYSEX_FINISH = 0x72, POD_SYSEX_DUMPMEM = 0x73, POD_SYSEX_DUMP = 0x74, POD_SYSEX_DUMPREQ = 0x75 /* dumps entire internal memory of PODxt Pro */ /* POD_SYSEX_DUMPMEM2 = 0x76 */ }; enum { POD_MONITOR_LEVEL = 0x04, POD_SYSTEM_INVALID = 0x10000 }; /* *INDENT-ON* */ enum { POD_DUMP_MEMORY = 2 }; enum { POD_BUSY_READ, POD_BUSY_WRITE, POD_CHANNEL_DIRTY, POD_SAVE_PRESSED, POD_BUSY_MIDISEND }; static const struct snd_ratden pod_ratden = { .num_min = 78125, .num_max = 78125, .num_step = 1, .den = 2 }; static struct line6_pcm_properties pod_pcm_properties = { .playback_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .capture_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .rates = { .nrats = 1, .rats = &pod_ratden}, .bytes_per_channel = 3 /* SNDRV_PCM_FMTBIT_S24_3LE */ }; static const char pod_version_header[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x02 }; static char *pod_alloc_sysex_buffer(struct usb_line6_pod *pod, int code, int size) { return line6_alloc_sysex_buffer(&pod->line6, POD_SYSEX_CODE, code, size); } /* Process a completely received message. */ static void line6_pod_process_message(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); const unsigned char *buf = pod->line6.buffer_message; if (memcmp(buf, pod_version_header, sizeof(pod_version_header)) == 0) { pod->firmware_version = buf[13] * 100 + buf[14] * 10 + buf[15]; pod->device_id = ((int)buf[8] << 16) | ((int)buf[9] << 8) | (int) buf[10]; if (pod->startup_progress == POD_STARTUP_VERSIONREQ) { pod->startup_progress = POD_STARTUP_SETUP; schedule_delayed_work(&line6->startup_work, 0); } return; } /* Only look for sysex messages from this device */ if (buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_DEVICE) && buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_UNKNOWN)) { return; } if (memcmp(buf + 1, line6_midi_id, sizeof(line6_midi_id)) != 0) return; if (buf[5] == POD_SYSEX_SYSTEM && buf[6] == POD_MONITOR_LEVEL) { short value = ((int)buf[7] << 12) | ((int)buf[8] << 8) | ((int)buf[9] << 4) | (int)buf[10]; pod->monitor_level = value; } } /* Send system parameter (from integer). */ static int pod_set_system_param_int(struct usb_line6_pod *pod, int value, int code) { char *sysex; static const int size = 5; sysex = pod_alloc_sysex_buffer(pod, POD_SYSEX_SYSTEM, size); if (!sysex) return -ENOMEM; sysex[SYSEX_DATA_OFS] = code; sysex[SYSEX_DATA_OFS + 1] = (value >> 12) & 0x0f; sysex[SYSEX_DATA_OFS + 2] = (value >> 8) & 0x0f; sysex[SYSEX_DATA_OFS + 3] = (value >> 4) & 0x0f; sysex[SYSEX_DATA_OFS + 4] = (value) & 0x0f; line6_send_sysex_message(&pod->line6, sysex, size); kfree(sysex); return 0; } /* "read" request on "serial_number" special file. 
*/ static ssize_t serial_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%u\n", pod->serial_number); } /* "read" request on "firmware_version" special file. */ static ssize_t firmware_version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d.%02d\n", pod->firmware_version / 100, pod->firmware_version % 100); } /* "read" request on "device_id" special file. */ static ssize_t device_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d\n", pod->device_id); } /* POD startup procedure. This is a sequence of functions with special requirements (e.g., must not run immediately after initialization, must not run in interrupt context). After the last one has finished, the device is ready to use. */ static void pod_startup(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); switch (pod->startup_progress) { case POD_STARTUP_VERSIONREQ: /* request firmware version: */ line6_version_request_async(line6); break; case POD_STARTUP_SETUP: /* serial number: */ line6_read_serial_number(&pod->line6, &pod->serial_number); /* ALSA audio interface: */ if (snd_card_register(line6->card)) dev_err(line6->ifcdev, "Failed to register POD card.\n"); pod->startup_progress = POD_STARTUP_DONE; break; default: break; } } /* POD special files: */ static DEVICE_ATTR_RO(device_id); static DEVICE_ATTR_RO(firmware_version); static DEVICE_ATTR_RO(serial_number); static struct attribute *pod_dev_attrs[] = { &dev_attr_device_id.attr, &dev_attr_firmware_version.attr, &dev_attr_serial_number.attr, NULL }; static const struct attribute_group pod_dev_attr_group = { .name = "pod", .attrs = pod_dev_attrs, }; /* control info callback */ static int snd_pod_control_monitor_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 65535; return 0; } /* control get callback */ static int snd_pod_control_monitor_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); ucontrol->value.integer.value[0] = pod->monitor_level; return 0; } /* control put callback */ static int snd_pod_control_monitor_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); if (ucontrol->value.integer.value[0] == pod->monitor_level) return 0; pod->monitor_level = ucontrol->value.integer.value[0]; pod_set_system_param_int(pod, ucontrol->value.integer.value[0], POD_MONITOR_LEVEL); return 1; } /* control definition */ static const struct snd_kcontrol_new pod_control_monitor = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Monitor Playback Volume", .index = 0, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_pod_control_monitor_info, .get = snd_pod_control_monitor_get, .put = snd_pod_control_monitor_put }; /* Try to init POD device. 
*/ static int pod_init(struct usb_line6 *line6, const struct usb_device_id *id) { int err; struct usb_line6_pod *pod = line6_to_pod(line6); line6->process_message = line6_pod_process_message; line6->startup = pod_startup; /* create sysfs entries: */ err = snd_card_add_dev_attr(line6->card, &pod_dev_attr_group); if (err < 0) return err; /* initialize PCM subsystem: */ err = line6_init_pcm(line6, &pod_pcm_properties); if (err < 0) return err; /* register monitor control: */ err = snd_ctl_add(line6->card, snd_ctl_new1(&pod_control_monitor, line6->line6pcm)); if (err < 0) return err; /* When the sound card is registered at this point, the PODxt Live displays "Invalid Code Error 07", so we do it later in the event handler. */ if (pod->line6.properties->capabilities & LINE6_CAP_CONTROL) { pod->monitor_level = POD_SYSTEM_INVALID; /* initiate startup procedure: */ schedule_delayed_work(&line6->startup_work, msecs_to_jiffies(POD_STARTUP_DELAY)); } return 0; } #define LINE6_DEVICE(prod) USB_DEVICE(0x0e41, prod) #define LINE6_IF_NUM(prod, n) USB_DEVICE_INTERFACE_NUMBER(0x0e41, prod, n) /* table of devices that work with this driver */ static const struct usb_device_id pod_id_table[] = { { LINE6_DEVICE(0x4250), .driver_info = LINE6_BASSPODXT }, { LINE6_DEVICE(0x4642), .driver_info = LINE6_BASSPODXTLIVE }, { LINE6_DEVICE(0x4252), .driver_info = LINE6_BASSPODXTPRO }, { LINE6_IF_NUM(0x5051, 1), .driver_info = LINE6_POCKETPOD }, { LINE6_DEVICE(0x5044), .driver_info = LINE6_PODXT }, { LINE6_IF_NUM(0x4650, 0), .driver_info = LINE6_PODXTLIVE_POD }, { LINE6_DEVICE(0x5050), .driver_info = LINE6_PODXTPRO }, {} }; MODULE_DEVICE_TABLE(usb, pod_id_table); static const struct line6_properties pod_properties_table[] = { [LINE6_BASSPODXT] = { .id = "BassPODxt", .name = "BassPODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTLIVE] = { .id = "BassPODxtLive", .name = "BassPODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTPRO] = { .id = "BassPODxtPro", .name = "BassPODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_POCKETPOD] = { .id = "PocketPOD", .name = "Pocket POD", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI, .altsetting = 0, .ep_ctrl_r = 0x82, .ep_ctrl_w = 0x02, /* no audio channel */ }, [LINE6_PODXT] = { .id = "PODxt", .name = "PODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTLIVE_POD] = { .id = "PODxtLive", .name = "PODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTPRO] = { .id = "PODxtPro", .name = "PODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, }; /* Probe USB device. 
*/ static int pod_probe(struct usb_interface *interface, const struct usb_device_id *id) { return line6_probe(interface, id, "Line6-POD", &pod_properties_table[id->driver_info], pod_init, sizeof(struct usb_line6_pod)); } static struct usb_driver pod_driver = { .name = KBUILD_MODNAME, .probe = pod_probe, .disconnect = line6_disconnect, #ifdef CONFIG_PM .suspend = line6_suspend, .resume = line6_resume, .reset_resume = line6_resume, #endif .id_table = pod_id_table, }; module_usb_driver(pod_driver); MODULE_DESCRIPTION("Line 6 POD USB driver"); MODULE_LICENSE("GPL");
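/*
 * Editor's note: standalone illustration (not driver code) of the nibble
 * encoding used by pod_set_system_param_int() above and decoded in
 * line6_pod_process_message(): a 16-bit value is carried in four MIDI
 * data bytes holding one nibble each, keeping every byte below 0x80.
 */
#include <assert.h>
#include <stdint.h>

static void pod_pack_nibbles(uint16_t value, uint8_t out[4])
{
	out[0] = (value >> 12) & 0x0f;
	out[1] = (value >> 8) & 0x0f;
	out[2] = (value >> 4) & 0x0f;
	out[3] = value & 0x0f;
}

static uint16_t pod_unpack_nibbles(const uint8_t in[4])
{
	return (in[0] << 12) | (in[1] << 8) | (in[2] << 4) | in[3];
}

int main(void)
{
	uint8_t nib[4];

	pod_pack_nibbles(0x1234, nib);
	assert(pod_unpack_nibbles(nib) == 0x1234);
	return 0;
}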
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2.h * * Defines macros and structures used in OCFS2 * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef OCFS2_H #define OCFS2_H #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/lockdep.h> #include <linux/jbd2.h> /* For union ocfs2_dlm_lksb */ #include "stackglue.h" #include "ocfs2_fs.h" #include "ocfs2_lockid.h" #include "ocfs2_ioctl.h" /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" #include "reservations.h" #include "filecheck.h" /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. */ #define OCFS2_CACHE_INFO_MAX_ARRAY 2 /* Flags for ocfs2_caching_info */ enum ocfs2_caching_info_flags { /* Indicates that the metadata cache is using the inline array */ OCFS2_CACHE_FL_INLINE = 1<<1, }; struct ocfs2_caching_operations; struct ocfs2_caching_info { /* * The parent structure provides the locks, but because the * parent structure can differ, it provides locking operations * to struct ocfs2_caching_info. */ const struct ocfs2_caching_operations *ci_ops; /* next two are protected by trans_inc_lock */ /* which transaction were we created on? Zero if none. */ unsigned long ci_created_trans; /* last transaction we were a part of. */ unsigned long ci_last_trans; /* Cache structures */ unsigned int ci_flags; unsigned int ci_num_cached; union { sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; /* * Need this prototype here instead of in uptodate.h because journal.h * uses it. */ struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ #define OCFS2_NODE_MAP_MAX_NODES 256 struct ocfs2_node_map { u16 num_nodes; unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; }; enum ocfs2_ast_action { OCFS2_AST_INVALID = 0, OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT, }; /* actions for an unlockast function to take. */ enum ocfs2_unlock_action { OCFS2_UNLOCK_INVALID = 0, OCFS2_UNLOCK_CANCEL_CONVERT, OCFS2_UNLOCK_DROP_LOCK, }; /* ocfs2_lock_res->l_flags flags. */ #define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized * the lvb */ #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in * dlm_lock */ #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to * downconvert*/ #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) #define OCFS2_LOCK_REFRESHING (0x00000020) #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization * for shutdown paths */ #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track * when to skip queueing * a lock because it's * about to be * dropped.
*/ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a call to dlm_lock. Only exists with BUSY set. */ #define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread * from downconverting * before the upconvert * has completed */ #define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster * lock has already * returned, do not block * dc thread from * downconverting */ struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats { u64 ls_total; /* Total wait in NSEC */ u32 ls_gets; /* Num acquires */ u32 ls_fail; /* Num failed acquires */ /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ u64 ls_last; /* Last unlock time in USEC */ }; #endif struct ocfs2_lock_res { void *l_priv; const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; struct list_head l_mask_waiters; struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; signed char l_level; signed char l_requested; signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; /* used from AST/BAST funcs. */ /* Data packed - enum type ocfs2_ast_action */ unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; unsigned int l_pending_gen; spinlock_t l_lock; struct ocfs2_dlm_lksb l_lksb; wait_queue_head_t l_event; struct list_head l_debug_list; #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map l_lockdep_map; #endif }; enum ocfs2_orphan_reco_type { ORPHAN_NO_NEED_TRUNCATE = 0, ORPHAN_NEED_TRUNCATE, }; enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE }; struct ocfs2_orphan_scan { struct mutex os_lock; struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ }; struct ocfs2_dlm_debug { struct kref d_refcnt; u32 d_filter_secs; struct list_head d_lockres_tracking; }; enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; struct ocfs2_alloc_stats { atomic_t moves; atomic_t local_data; atomic_t bitmap_data; atomic_t bg_allocs; atomic_t bg_extends; }; enum ocfs2_local_alloc_state { OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for * this mountpoint. */ OCFS2_LA_ENABLED, /* Local alloc is in use. */ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number * of bits has been reduced. */ OCFS2_LA_DISABLED /* Local alloc has temporarily been * disabled. 
*/ }; enum ocfs2_mount_options { OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 #define OCFS2_OSB_HARD_RO 0x0002 #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; struct super_block *sb; }; enum ocfs2_journal_trigger_type { OCFS2_JTR_DI, OCFS2_JTR_EB, OCFS2_JTR_RB, OCFS2_JTR_GD, OCFS2_JTR_DB, OCFS2_JTR_XB, OCFS2_JTR_DQ, OCFS2_JTR_DR, OCFS2_JTR_DL, OCFS2_JTR_NONE /* This must be the last entry */ }; #define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, OCFS2_REC_QUOTA_WANT_DISABLE, /* * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for * ocfs2_recovery_disable_quota() to work. */ OCFS2_REC_QUOTA_DISABLED, OCFS2_REC_WANT_DISABLE, /* * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work */ OCFS2_REC_DISABLED, }; struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; u32 *slot_recovery_generations; spinlock_t node_map_lock; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; char *uuid_str; u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; u32 s_feature_compat; u32 s_feature_incompat; u32 s_feature_ro_compat; /* Protects s_next_generation, osb_flags and s_inode_steal_slot. * Could protect more on osb as it's very short lived. 
*/ spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; u16 s_inode_steal_slot; u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; unsigned int max_slots; unsigned int node_num; int slot_num; int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; /* Journal triggers for checksum */ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; struct delayed_work la_enable_wq; /* * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; /* osb_clusters_at_boot can become stale! Do not trust it to * be up to date. */ unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ struct buffer_head *local_alloc_bh; u64 la_last_gd; struct ocfs2_reservation_map osb_la_resmap; unsigned int osb_resv_level; unsigned int osb_dir_resv_level; /* Next two fields are for local node slot recovery during * mount. */ struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; struct ocfs2_blockcheck_stats osb_ecc_stats; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; wait_queue_head_t recovery_event; spinlock_t dc_task_lock; struct task_struct *dc_task; wait_queue_head_t dc_event; unsigned long dc_wake_sequence; unsigned long dc_work_sequence; /* * Any thread can add locks to the list, but the downconvert * thread is the only one allowed to remove locks. Any change * to this rule requires updating * ocfs2_downconvert_thread_do_work(). */ struct list_head blocked_lock_list; unsigned long blocked_lock_count; /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; /* Truncate log info */ struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; struct ocfs2_orphan_scan osb_orphan_scan; /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; unsigned int osb_dx_mask; u32 osb_dx_seed[4]; /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; /* rb tree root for refcount lock. 
*/ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; /* * OCFS2 needs to schedule several different types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these * types of work tend to be heavy we avoid using the kernel events * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; /* sysfs directory per partition */ struct kset *osb_dev_kset; /* file check related stuff */ struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) return 0; if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) return 0; return 1; } static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) return 1; return 0; } static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) { /* * Support for sparse files is a pre-requisite */ if (!ocfs2_sparse_alloc(osb)) return 0; if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) return 1; return 0; } static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) return 1; return 0; } static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) return 1; return 0; } static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) return 1; return 0; } static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) return 1; return 0; } static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) return 1; return 0; } static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) return OCFS2_DX_LINK_MAX; return OCFS2_LINK_MAX; } static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) { u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) { u16 lo, hi; lo = nlink; hi = nlink >> OCFS2_LINKS_HI_SHIFT; di->i_links_count = cpu_to_le16(lo); di->i_links_count_hi = cpu_to_le16(hi); } static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) { u32 links = ocfs2_read_links_count(di); links += n; ocfs2_set_links_count(di, links); } static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) return 1; return 0; } /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock * too! 
*/ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, unsigned long flag) { spin_lock(&osb->osb_lock); osb->osb_flags |= flag; spin_unlock(&osb->osb_lock); } static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { spin_lock(&osb->osb_lock); osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); if (hard) osb->osb_flags |= OCFS2_OSB_HARD_RO; else osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); } static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_HARD_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) { int ret; spin_lock(&osb->osb_lock); ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); return ret; } static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); } static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) { if (ocfs2_clusterinfo_valid(osb) && !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) return 1; return 0; } static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) { return ocfs2_o2cb_stack(osb) && (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); } #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) #define OCFS2_IS_VALID_DX_ROOT(ptr) \ (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) #define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { return (unsigned long)(blkno & (u64)ULONG_MAX); } static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, u32 clusters) { int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u64)clusters << c_to_b_bits; } static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; blocks += (1 << b_to_c_bits) - 1; return (u32)(blocks >> b_to_c_bits); } static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; return (u32)(blocks >> b_to_c_bits); } static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, u64 bytes) { int cl_bits = 
OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; bytes += OCFS2_SB(sb)->s_clustersize - 1; /* OCFS2 just cannot have enough clusters to overflow this */ clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = (unsigned int)(bytes >> cl_bits); return clusters; } static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { bytes += sb->s_blocksize - 1; return bytes >> sb->s_blocksize_bits; } static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, u32 clusters) { return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; } static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, u64 blocks) { int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; unsigned int clusters; clusters = ocfs2_blocks_to_clusters(sb, blocks); return (u64)clusters << bits; } static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, u64 bytes) { int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int clusters; clusters = ocfs2_clusters_for_bytes(sb, bytes); return (u64)clusters << cl_bits; } static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, u64 bytes) { u64 blocks; blocks = ocfs2_blocks_for_bytes(sb, bytes); return blocks << sb->s_blocksize_bits; } static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) { return (unsigned long)((bytes + 511) >> 9); } static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, unsigned long pg_index) { u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; if (unlikely(PAGE_SHIFT > cbits)) clusters = pg_index << (PAGE_SHIFT - cbits); else if (PAGE_SHIFT < cbits) clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } /* * Find the 1st page index which covers the given clusters. 
*/ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, u32 clusters) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; if (PAGE_SHIFT > cbits) { index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); } else if (PAGE_SHIFT < cbits) { index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; } static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; if (PAGE_SHIFT < cbits) pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, unsigned int megs) { BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, unsigned int clusters) { return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); } static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_set_bit(bit, bitmap); } static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); ocfs2_clear_bit(bit, bitmap); } static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) { bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); return ocfs2_test_bit(bit, bitmap); } static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, int start) { int fix = 0, ret, tmpmax; bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); tmpmax = max + fix; start += fix; ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; if (ret > max) return max; return ret; } #endif /* OCFS2_H */
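/*
 * Editor's note: standalone arithmetic check (not filesystem code) of the
 * shift-based conversions above, assuming hypothetical sizes of 4 KiB
 * blocks (12 bits) and 1 MiB clusters (20 bits).
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	int cl_bits = 20, blk_bits = 12;
	uint64_t bytes = ((uint64_t)1 << cl_bits) + 1; /* one cluster + 1 byte */

	/* ocfs2_clusters_to_blocks(): clusters << (cl_bits - blk_bits) */
	assert(((uint64_t)3 << (cl_bits - blk_bits)) == 768);

	/* ocfs2_clusters_for_bytes(): bytes rounded up to whole clusters */
	assert(((bytes + (1u << cl_bits) - 1) >> cl_bits) == 2);
	return 0;
}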
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu: the requested cpu
 * @pwq: pointer to struct pool_workqueue
 * @work: pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 */
TRACE_EVENT(workqueue_queue_work,

	TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
		 struct work_struct *work),

	TP_ARGS(req_cpu, pwq, work),

	TP_STRUCT__entry(
		__field(void *, work)
		__field(void *, function)
		__string(workqueue, pwq->wq->name)
		__field(int, req_cpu)
		__field(int, cpu)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
		__assign_str(workqueue);
		__entry->req_cpu	= req_cpu;
		__entry->cpu		= pwq->pool->cpu;
	),

	TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
		  __entry->work, __entry->function, __get_str(workqueue),
		  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work: pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless the @max_active limit
 * is reached.
 */
TRACE_EVENT(workqueue_activate_work,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field(void *, work)
		__field(void *, function)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
	),

	TP_printk("work struct %p function=%ps",
		  __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work: pointer to struct work_struct
 *
 * Allows tracking of workqueue execution.
 */
TRACE_EVENT(workqueue_execute_start,

	TP_PROTO(struct work_struct *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field(void *, work)
		__field(void *, function)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= work->func;
	),

	TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work: pointer to struct work_struct
 * @function: pointer to worker function
 *
 * Allows tracking of workqueue execution.
 */
TRACE_EVENT(workqueue_execute_end,

	TP_PROTO(struct work_struct *work, work_func_t function),

	TP_ARGS(work, function),

	TP_STRUCT__entry(
		__field(void *, work)
		__field(void *, function)
	),

	TP_fast_assign(
		__entry->work		= work;
		__entry->function	= function;
	),

	TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /* _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
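Besides the trace-event plumbing, TRACE_EVENT() also generates register/unregister helpers so kernel code can attach a live probe to each event. Below is a minimal module sketch attaching to workqueue_execute_start; it assumes the tracepoint is exported to modules, and the probe/function names here are illustrative.

#include <linux/module.h>
#include <trace/events/workqueue.h>

/* the first probe argument is the registration cookie; the rest mirrors
 * TP_PROTO(struct work_struct *work) */
static void demo_exec_start_probe(void *data, struct work_struct *work)
{
	pr_info("work %ps about to execute\n", work->func);
}

static int __init demo_init(void)
{
	return register_trace_workqueue_execute_start(demo_exec_start_probe,
						      NULL);
}

static void __exit demo_exit(void)
{
	unregister_trace_workqueue_execute_start(demo_exec_start_probe, NULL);
	/* wait until no CPU can still be inside the probe */
	tracepoint_synchronize_unregister();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");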
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Abilis Systems Single DVB-T Receiver
 * Copyright (C) 2008 Pierrick Hascoet <pierrick.hascoet@abilis.com>
 * Copyright (C) 2010 Devin Heitmueller <dheitmueller@kernellabs.com>
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/usb.h>

#include "as102_drv.h"
#include "as102_usb_drv.h"
#include "as102_fw.h"

static void as102_usb_disconnect(struct usb_interface *interface);
static int as102_usb_probe(struct usb_interface *interface,
			   const struct usb_device_id *id);

static int as102_usb_start_stream(struct as102_dev_t *dev);
static void as102_usb_stop_stream(struct as102_dev_t *dev);

static int as102_open(struct inode *inode, struct file *file);
static int as102_release(struct inode *inode, struct file *file);

static const struct usb_device_id as102_usb_id_table[] = {
	{ USB_DEVICE(AS102_USB_DEVICE_VENDOR_ID, AS102_USB_DEVICE_PID_0001) },
	{ USB_DEVICE(PCTV_74E_USB_VID, PCTV_74E_USB_PID) },
	{ USB_DEVICE(ELGATO_EYETV_DTT_USB_VID, ELGATO_EYETV_DTT_USB_PID) },
	{ USB_DEVICE(NBOX_DVBT_DONGLE_USB_VID, NBOX_DVBT_DONGLE_USB_PID) },
	{ USB_DEVICE(SKY_IT_DIGITAL_KEY_USB_VID, SKY_IT_DIGITAL_KEY_USB_PID) },
	{ } /* Terminating entry */
};

/* Note that this table must always have the same number of entries as the
 * as102_usb_id_table struct */
static const char * const as102_device_names[] = {
	AS102_REFERENCE_DESIGN,
	AS102_PCTV_74E,
	AS102_ELGATO_EYETV_DTT_NAME,
	AS102_NBOX_DVBT_DONGLE_NAME,
	AS102_SKY_IT_DIGITAL_KEY_NAME,
	NULL /* Terminating entry */
};

/* eLNA configuration: devices built on the reference design work best
 * with 0xA0, while custom designs seem to require 0xC0 */
static uint8_t const as102_elna_cfg[] = {
	0xA0,
	0xC0,
	0xC0,
	0xA0,
	0xA0,
	0x00 /* Terminating entry */
};

struct usb_driver as102_usb_driver = {
	.name		= DRIVER_FULL_NAME,
	.probe		= as102_usb_probe,
	.disconnect	= as102_usb_disconnect,
	.id_table	= as102_usb_id_table
};

static const struct file_operations as102_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= as102_open,
	.release	= as102_release,
};

static struct usb_class_driver as102_usb_class_driver = {
	.name		= "aton2-%d",
	.fops		= &as102_dev_fops,
	.minor_base	= AS102_DEVICE_MAJOR,
};

static int as102_usb_xfer_cmd(struct as10x_bus_adapter_t *bus_adap,
			      unsigned char *send_buf, int send_buf_len,
			      unsigned char *recv_buf, int recv_buf_len)
{
	int ret = 0;

	if (send_buf != NULL) {
		ret = usb_control_msg(bus_adap->usb_dev,
				      usb_sndctrlpipe(bus_adap->usb_dev, 0),
				      AS102_USB_DEVICE_TX_CTRL_CMD,
				      USB_DIR_OUT | USB_TYPE_VENDOR |
				      USB_RECIP_DEVICE,
				      bus_adap->cmd_xid, /* value */
				      0, /* index */
				      send_buf, send_buf_len,
				      USB_CTRL_SET_TIMEOUT /* 200 */);
		if (ret < 0) {
			dev_dbg(&bus_adap->usb_dev->dev,
				"usb_control_msg(send) failed, err %i\n", ret);
			return ret;
		}

		if (ret != send_buf_len) {
			dev_dbg(&bus_adap->usb_dev->dev,
				"only wrote %d of %d bytes\n",
				ret, send_buf_len);
			return -1;
		}
	}

	if (recv_buf != NULL) {
#ifdef TRACE
		dev_dbg(&bus_adap->usb_dev->dev,
			"want to read: %d bytes\n", recv_buf_len);
#endif
		ret = usb_control_msg(bus_adap->usb_dev,
				      usb_rcvctrlpipe(bus_adap->usb_dev, 0),
				      AS102_USB_DEVICE_RX_CTRL_CMD,
				      USB_DIR_IN | USB_TYPE_VENDOR |
				      USB_RECIP_DEVICE,
				      bus_adap->cmd_xid, /* value */
				      0, /* index */
				      recv_buf, recv_buf_len,
				      USB_CTRL_GET_TIMEOUT /* 200 */);
		if (ret < 0) {
			dev_dbg(&bus_adap->usb_dev->dev,
				"usb_control_msg(recv) failed, err %i\n", ret);
			return ret;
		}
#ifdef TRACE
		dev_dbg(&bus_adap->usb_dev->dev,
			"read %d bytes\n", recv_buf_len);
#endif
	}

	return ret;
}

static int as102_send_ep1(struct as10x_bus_adapter_t *bus_adap,
			  unsigned char *send_buf,
			  int send_buf_len,
			  int swap32)
{
	int ret, actual_len;

	ret = usb_bulk_msg(bus_adap->usb_dev,
			   usb_sndbulkpipe(bus_adap->usb_dev, 1),
			   send_buf, send_buf_len, &actual_len, 200);
	if (ret) {
		dev_dbg(&bus_adap->usb_dev->dev,
			"usb_bulk_msg(send) failed, err %i\n", ret);
		return ret;
	}

	if (actual_len != send_buf_len) {
		dev_dbg(&bus_adap->usb_dev->dev, "only wrote %d of %d bytes\n",
			actual_len, send_buf_len);
		return -1;
	}
	return actual_len;
}

static int as102_read_ep2(struct as10x_bus_adapter_t *bus_adap,
			  unsigned char *recv_buf, int recv_buf_len)
{
	int ret, actual_len;

	if (recv_buf == NULL)
		return -EINVAL;

	ret = usb_bulk_msg(bus_adap->usb_dev,
			   usb_rcvbulkpipe(bus_adap->usb_dev, 2),
			   recv_buf, recv_buf_len, &actual_len, 200);
	if (ret) {
		dev_dbg(&bus_adap->usb_dev->dev,
			"usb_bulk_msg(recv) failed, err %i\n", ret);
		return ret;
	}

	if (actual_len != recv_buf_len) {
		dev_dbg(&bus_adap->usb_dev->dev, "only read %d of %d bytes\n",
			actual_len, recv_buf_len);
		return -1;
	}
	return actual_len;
}

static const struct as102_priv_ops_t as102_priv_ops = {
	.upload_fw_pkt	= as102_send_ep1,
	.xfer_cmd	= as102_usb_xfer_cmd,
	.as102_read_ep2	= as102_read_ep2,
	.start_stream	= as102_usb_start_stream,
	.stop_stream	= as102_usb_stop_stream,
};

static int as102_submit_urb_stream(struct as102_dev_t *dev, struct urb *urb)
{
	int err;

	usb_fill_bulk_urb(urb,
			  dev->bus_adap.usb_dev,
			  usb_rcvbulkpipe(dev->bus_adap.usb_dev, 0x2),
			  urb->transfer_buffer,
			  AS102_USB_BUF_SIZE,
			  as102_urb_stream_irq,
			  dev);

	err = usb_submit_urb(urb, GFP_ATOMIC);
	if (err)
		dev_dbg(&urb->dev->dev,
			"%s: usb_submit_urb failed\n", __func__);

	return err;
}

void as102_urb_stream_irq(struct urb *urb)
{
	struct as102_dev_t *as102_dev = urb->context;

	if (urb->actual_length > 0) {
		dvb_dmx_swfilter(&as102_dev->dvb_dmx,
				 urb->transfer_buffer,
				 urb->actual_length);
	} else {
		if (urb->actual_length == 0)
			memset(urb->transfer_buffer, 0, AS102_USB_BUF_SIZE);
	}

	/* if the stream has not been stopped, re-submit the urb */
	if (as102_dev->streaming)
		as102_submit_urb_stream(as102_dev, urb);
}

static void as102_free_usb_stream_buffer(struct as102_dev_t *dev)
{
	int i;

	for (i = 0; i < MAX_STREAM_URB; i++)
		usb_free_urb(dev->stream_urb[i]);

	usb_free_coherent(dev->bus_adap.usb_dev,
			  MAX_STREAM_URB * AS102_USB_BUF_SIZE,
			  dev->stream,
			  dev->dma_addr);
}

static int as102_alloc_usb_stream_buffer(struct as102_dev_t *dev)
{
	int i;

	dev->stream = usb_alloc_coherent(dev->bus_adap.usb_dev,
					 MAX_STREAM_URB * AS102_USB_BUF_SIZE,
					 GFP_KERNEL,
					 &dev->dma_addr);
	if (!dev->stream) {
		dev_dbg(&dev->bus_adap.usb_dev->dev,
			"%s: usb_buffer_alloc failed\n", __func__);
		return -ENOMEM;
	}

	memset(dev->stream, 0, MAX_STREAM_URB * AS102_USB_BUF_SIZE);

	/* init urb buffers */
	for (i = 0; i < MAX_STREAM_URB; i++) {
		struct urb *urb;

		urb = usb_alloc_urb(0, GFP_KERNEL);
		if (urb == NULL) {
			as102_free_usb_stream_buffer(dev);
			return -ENOMEM;
		}

		urb->transfer_buffer = dev->stream + (i * AS102_USB_BUF_SIZE);
		urb->transfer_dma = dev->dma_addr + (i * AS102_USB_BUF_SIZE);
		urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP;
		urb->transfer_buffer_length = AS102_USB_BUF_SIZE;

		dev->stream_urb[i] = urb;
	}
	return 0;
}

static void as102_usb_stop_stream(struct as102_dev_t *dev)
{
	int i;

	for (i = 0; i < MAX_STREAM_URB; i++)
		usb_kill_urb(dev->stream_urb[i]);
}

static int as102_usb_start_stream(struct as102_dev_t *dev)
{
	int i, ret = 0;

	for (i = 0; i < MAX_STREAM_URB; i++) {
		ret = as102_submit_urb_stream(dev, dev->stream_urb[i]);
		if (ret) {
			as102_usb_stop_stream(dev);
			return ret;
		}
	}
	return 0;
}

static void as102_usb_release(struct kref *kref)
{
	struct as102_dev_t *as102_dev;

	as102_dev = container_of(kref, struct as102_dev_t, kref);
	usb_put_dev(as102_dev->bus_adap.usb_dev);
	kfree(as102_dev);
}

static void as102_usb_disconnect(struct usb_interface *intf)
{
	struct as102_dev_t *as102_dev;

	/* extract as102_dev_t from usb_device private data */
	as102_dev = usb_get_intfdata(intf);

	/* unregister dvb layer */
	as102_dvb_unregister(as102_dev);

	/* free usb buffers */
	as102_free_usb_stream_buffer(as102_dev);

	usb_set_intfdata(intf, NULL);

	/* usb unregister device */
	usb_deregister_dev(intf, &as102_usb_class_driver);

	/* decrement usage counter */
	kref_put(&as102_dev->kref, as102_usb_release);

	pr_info("%s: device has been disconnected\n", DRIVER_NAME);
}

static int as102_usb_probe(struct usb_interface *intf,
			   const struct usb_device_id *id)
{
	int ret;
	struct as102_dev_t *as102_dev;
	int i;

	/* This should never actually happen */
	if (ARRAY_SIZE(as102_usb_id_table) !=
	    (sizeof(as102_device_names) / sizeof(const char *))) {
		pr_err("Device names table invalid size");
		return -EINVAL;
	}

	as102_dev = kzalloc(sizeof(struct as102_dev_t), GFP_KERNEL);
	if (as102_dev == NULL)
		return -ENOMEM;

	/* Assign the user-friendly device name */
	for (i = 0; i < ARRAY_SIZE(as102_usb_id_table); i++) {
		if (id == &as102_usb_id_table[i]) {
			as102_dev->name = as102_device_names[i];
			as102_dev->elna_cfg = as102_elna_cfg[i];
		}
	}

	if (as102_dev->name == NULL)
		as102_dev->name = "Unknown AS102 device";

	/* set private callback functions */
	as102_dev->bus_adap.ops = &as102_priv_ops;

	/* init cmd token for usb bus */
	as102_dev->bus_adap.cmd = &as102_dev->bus_adap.token.usb.c;
	as102_dev->bus_adap.rsp = &as102_dev->bus_adap.token.usb.r;

	/* init kernel device reference */
	kref_init(&as102_dev->kref);

	/* store as102 device to usb_device private data */
	usb_set_intfdata(intf, (void *) as102_dev);

	/* store in as102 device the usb_device pointer */
	as102_dev->bus_adap.usb_dev = usb_get_dev(interface_to_usbdev(intf));

	/* we can register the device now, as it is ready */
	ret = usb_register_dev(intf, &as102_usb_class_driver);
	if (ret < 0) {
		/* something prevented us from registering this driver */
		dev_err(&intf->dev,
			"%s: usb_register_dev() failed (errno = %d)\n",
			__func__, ret);
		goto failed;
	}

	pr_info("%s: device has been detected\n", DRIVER_NAME);

	/* request buffer allocation for streaming */
	ret = as102_alloc_usb_stream_buffer(as102_dev);
	if (ret != 0)
		goto failed_stream;

	/* register dvb layer */
	ret = as102_dvb_register(as102_dev);
	if (ret != 0)
		goto failed_dvb;

	return ret;

failed_dvb:
	as102_free_usb_stream_buffer(as102_dev);
failed_stream:
	usb_deregister_dev(intf, &as102_usb_class_driver);
failed:
	usb_put_dev(as102_dev->bus_adap.usb_dev);
	usb_set_intfdata(intf, NULL);
	kfree(as102_dev);
	return ret;
}

static int as102_open(struct inode *inode, struct file *file)
{
	int ret = 0, minor = 0;
	struct usb_interface *intf = NULL;
	struct as102_dev_t *dev = NULL;

	/* read minor from inode */
	minor = iminor(inode);

	/* fetch device from usb interface */
	intf = usb_find_interface(&as102_usb_driver, minor);
	if (intf == NULL) {
		pr_err("%s: can't find device for minor %d\n",
		       __func__, minor);
		ret = -ENODEV;
		goto exit;
	}

	/* get our device */
	dev = usb_get_intfdata(intf);
	if (dev == NULL) {
		ret = -EFAULT;
		goto exit;
	}

	/* save our device object in the file's private structure */
	file->private_data = dev;

	/* increment our usage count for the device */
	kref_get(&dev->kref);

exit:
	return ret;
}

static int as102_release(struct inode *inode, struct file *file)
{
	struct as102_dev_t *dev = NULL;

	dev = file->private_data;
	if (dev != NULL) {
		/* decrement the count on our device */
		kref_put(&dev->kref, as102_usb_release);
	}
	return 0;
}

MODULE_DEVICE_TABLE(usb, as102_usb_id_table);
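The driver's lifetime handling above hinges on a single kref: probe takes the initial reference, each open adds one, and release/disconnect drop theirs, so as102_usb_release() runs exactly once after the last user is gone. A reduced sketch of the same pattern, with illustrative names (demo_dev is not a real driver structure):

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_dev {
	struct kref kref;
	/* ... device state ... */
};

static void demo_free(struct kref *kref)
{
	/* runs once, when the last reference is dropped */
	kfree(container_of(kref, struct demo_dev, kref));
}

static struct demo_dev *demo_probe(void)
{
	struct demo_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (dev)
		kref_init(&dev->kref);	/* refcount = 1, held by probe */
	return dev;
}

static void demo_open(struct demo_dev *dev)
{
	kref_get(&dev->kref);		/* one extra reference per open file */
}

static void demo_close_or_disconnect(struct demo_dev *dev)
{
	kref_put(&dev->kref, demo_free);
}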
// SPDX-License-Identifier: GPL-2.0+

#include <drm/drm_atomic_helper.h>
#include <drm/drm_edid.h>
#include <drm/drm_managed.h>
#include <drm/drm_probe_helper.h>

#include "vkms_connector.h"

static const struct drm_connector_funcs vkms_connector_funcs = {
	.fill_modes = drm_helper_probe_single_connector_modes,
	.reset = drm_atomic_helper_connector_reset,
	.atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state,
	.atomic_destroy_state = drm_atomic_helper_connector_destroy_state,
};

static int vkms_conn_get_modes(struct drm_connector *connector)
{
	int count;

	/* Use the default modes list from DRM */
	count = drm_add_modes_noedid(connector, XRES_MAX, YRES_MAX);
	drm_set_preferred_mode(connector, XRES_DEF, YRES_DEF);

	return count;
}

static struct drm_encoder *vkms_conn_best_encoder(struct drm_connector *connector)
{
	struct drm_encoder *encoder;

	drm_connector_for_each_possible_encoder(connector, encoder)
		return encoder;

	return NULL;
}

static const struct drm_connector_helper_funcs vkms_conn_helper_funcs = {
	.get_modes    = vkms_conn_get_modes,
	.best_encoder = vkms_conn_best_encoder,
};

struct vkms_connector *vkms_connector_init(struct vkms_device *vkmsdev)
{
	struct drm_device *dev = &vkmsdev->drm;
	struct vkms_connector *connector;
	int ret;

	connector = drmm_kzalloc(dev, sizeof(*connector), GFP_KERNEL);
	if (!connector)
		return ERR_PTR(-ENOMEM);

	ret = drmm_connector_init(dev, &connector->base, &vkms_connector_funcs,
				  DRM_MODE_CONNECTOR_VIRTUAL, NULL);
	if (ret)
		return ERR_PTR(ret);

	drm_connector_helper_add(&connector->base, &vkms_conn_helper_funcs);

	return connector;
}
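drmm_kzalloc() and drmm_connector_init() tie the allocation and the connector to the drm_device lifetime, which is why vkms_connector_init() has no kfree() on its error paths. For contrast, a hedged sketch of what an unmanaged equivalent would roughly look like (illustrative only, not VKMS code):

	/* unmanaged variant: every error path must unwind by hand */
	connector = kzalloc(sizeof(*connector), GFP_KERNEL);
	if (!connector)
		return ERR_PTR(-ENOMEM);

	ret = drm_connector_init(dev, &connector->base, &vkms_connector_funcs,
				 DRM_MODE_CONNECTOR_VIRTUAL);
	if (ret) {
		kfree(connector);	/* cleanup the drmm_ variant avoids */
		return ERR_PTR(ret);
	}
	/* ...and teardown would need drm_connector_cleanup() + kfree() */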
/*
 * net/tipc/net.c: TIPC network routing code
 *
 * Copyright (c) 1995-2006, 2014, Ericsson AB
 * Copyright (c) 2005, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "core.h"
#include "net.h"
#include "name_distr.h"
#include "subscr.h"
#include "socket.h"
#include "node.h"
#include "bcast.h"
#include "link.h"
#include "netlink.h"
#include "monitor.h"

/*
 * The TIPC locking policy is designed to ensure a very fine locking
 * granularity, permitting complete parallel access to individual
 * port and node/link instances. The code consists of four major
 * locking domains, each protected with their own disjunct set of locks.
 *
 * 1: The bearer level.
 *    The RTNL lock is used to serialize bearer configuration on the
 *    update side, and the RCU read lock is taken on the read side to
 *    keep bearer instances valid on both the transmit and receive
 *    paths.
 *
 * 2: The node and link level.
 *    All node instances are saved into the two lists tipc_node_list and
 *    node_htable. Both lists are protected by node_list_lock on the write
 *    side and guarded by RCU on the read side. A node instance is
 *    destroyed only when the TIPC module is removed, at which point no
 *    user can still be accessing it. Therefore, except when iterating
 *    the two lists under RCU protection, there is no need to hold the
 *    RCU read lock while accessing a node instance elsewhere.
 *
 *    In addition, all members of a node structure, including its link
 *    instances, are protected by the node spin lock.
 *
 * 3: The transport level of the protocol.
 *    This consists of the structures port (and its user level
 *    representations, such as user_port and tipc_sock), reference and
 *    tipc_user (port.c, reg.c, socket.c).
 *
 *    This layer has four different locks:
 *    - The tipc_port spin_lock. This protects each port instance from
 *      parallel data access and removal. Since we cannot place this
 *      lock in the port itself, it has been placed in the corresponding
 *      reference table entry, which has the same life cycle as the
 *      module. This entry is difficult to access from outside the TIPC
 *      core, however, so a pointer to the lock has been added to the
 *      port instance, to be used for unlocking only.
 *    - A read/write lock to protect the reference table itself (reg.c).
 *      (Nobody is using read-only access to this, so it could just as
 *      well be changed to a spin_lock.)
 *    - A spin lock to protect the registry of kernel/driver users (reg.c).
 *    - A global spin_lock (tipc_port_lock), whose only task is to ensure
 *      consistency where more than one port is involved in an operation,
 *      i.e., when a port is part of a linked list of ports.
 *      There are two such lists: 'port_list', which is used for management,
 *      and 'wait_list', which is used to queue ports during congestion.
 *
 * 4: The name table (name_table.c, name_distr.c, subscription.c)
 *    - There is one big read/write-lock (tipc_nametbl_lock) protecting the
 *      overall name table structure. Nothing may be added to or removed
 *      from this structure without holding write access to it.
 *    - There is one local spin_lock per sub_sequence, which can be seen
 *      as a sub-domain of the tipc_nametbl_lock domain. It is used only
 *      for translation operations, and is needed because a translation
 *      advances the root of the 'publication' linked list between each
 *      lookup. This is always used within the scope of a
 *      tipc_nametbl_lock(read).
 *    - A local spin_lock protecting the queue of subscriber events.
 */

static void tipc_net_finalize(struct net *net, u32 addr);

int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
{
	if (tipc_own_id(net)) {
		pr_info("Cannot configure node identity twice\n");
		return -1;
	}
	pr_info("Started in network mode\n");

	if (node_id)
		tipc_set_node_id(net, node_id);
	if (addr)
		tipc_net_finalize(net, addr);
	return 0;
}

static void tipc_net_finalize(struct net *net, u32 addr)
{
	struct tipc_net *tn = tipc_net(net);
	struct tipc_socket_addr sk = {0, addr};
	struct tipc_uaddr ua;

	tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE,
		   TIPC_NODE_STATE, addr, addr);

	if (cmpxchg(&tn->node_addr, 0, addr))
		return;
	tipc_set_node_addr(net, addr);
	tipc_named_reinit(net);
	tipc_sk_reinit(net);
	tipc_mon_reinit_self(net);
	tipc_nametbl_publish(net, &ua, &sk, addr);
}

void tipc_net_finalize_work(struct work_struct *work)
{
	struct tipc_net *tn = container_of(work, struct tipc_net, work);

	rtnl_lock();
	tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr);
	rtnl_unlock();
}

void tipc_net_stop(struct net *net)
{
	if (!tipc_own_id(net))
		return;

	rtnl_lock();
	tipc_bearer_stop(net);
	tipc_node_stop(net);
	rtnl_unlock();

	pr_info("Left network mode\n");
}

static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg)
{
	struct tipc_net *tn = net_generic(net, tipc_net_id);
	u64 *w0 = (u64 *)&tn->node_id[0];
	u64 *w1 = (u64 *)&tn->node_id[8];
	struct nlattr *attrs;
	void *hdr;

	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
			  NLM_F_MULTI, TIPC_NL_NET_GET);
	if (!hdr)
		return -EMSGSIZE;

	attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET);
	if (!attrs)
		goto msg_full;

	if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id))
		goto attr_msg_full;
	if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0))
		goto attr_msg_full;
	if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0))
		goto attr_msg_full;
	nla_nest_end(msg->skb, attrs);
	genlmsg_end(msg->skb, hdr);

	return 0;

attr_msg_full:
	nla_nest_cancel(msg->skb, attrs);
msg_full:
	genlmsg_cancel(msg->skb, hdr);

	return -EMSGSIZE;
}

int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int err;
	int done = cb->args[0];
	struct tipc_nl_msg msg;

	if (done)
		return 0;

	msg.skb = skb;
	msg.portid = NETLINK_CB(cb->skb).portid;
	msg.seq = cb->nlh->nlmsg_seq;

	err = __tipc_nl_add_net(net, &msg);
	if (err)
		goto out;

	done = 1;
out:
	cb->args[0] = done;

	return skb->len;
}

int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
	struct net *net = sock_net(skb->sk);
	struct tipc_net *tn = tipc_net(net);
	int err;

	if (!info->attrs[TIPC_NLA_NET])
		return -EINVAL;

	err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX,
					  info->attrs[TIPC_NLA_NET],
					  tipc_nl_net_policy, info->extack);
	if (err)
		return err;

	/* Can't change net id once TIPC has joined a network */
	if (tipc_own_addr(net))
		return -EPERM;

	if (attrs[TIPC_NLA_NET_ID]) {
		u32 val;

		val = nla_get_u32(attrs[TIPC_NLA_NET_ID]);
		if (val < 1 || val > 9999)
			return -EINVAL;

		tn->net_id = val;
	}

	if (attrs[TIPC_NLA_NET_ADDR]) {
		u32 addr;

		addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
		if (!addr)
			return -EINVAL;
		tn->legacy_addr_format = true;
		tipc_net_init(net, NULL, addr);
	}

	if (attrs[TIPC_NLA_NET_NODEID]) {
		u8 node_id[NODE_ID_LEN];
		u64 *w0 = (u64 *)&node_id[0];
		u64 *w1 = (u64 *)&node_id[8];

		if (!attrs[TIPC_NLA_NET_NODEID_W1])
			return -EINVAL;
		*w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
		*w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
		tipc_net_init(net, node_id, 0);
	}
	return 0;
}

int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
{
	int err;

	rtnl_lock();
	err = __tipc_nl_net_set(skb, info);
	rtnl_unlock();

	return err;
}

static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg)
{
	struct tipc_net *tn = tipc_net(net);
	struct nlattr *attrs;
	void *hdr;

	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
			  0, TIPC_NL_ADDR_LEGACY_GET);
	if (!hdr)
		return -EMSGSIZE;

	attrs = nla_nest_start(msg->skb, TIPC_NLA_NET);
	if (!attrs)
		goto msg_full;

	if (tn->legacy_addr_format)
		if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY))
			goto attr_msg_full;

	nla_nest_end(msg->skb, attrs);
	genlmsg_end(msg->skb, hdr);

	return 0;

attr_msg_full:
	nla_nest_cancel(msg->skb, attrs);
msg_full:
	genlmsg_cancel(msg->skb, hdr);

	return -EMSGSIZE;
}

int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info)
{
	struct net *net = sock_net(skb->sk);
	struct tipc_nl_msg msg;
	struct sk_buff *rep;
	int err;

	rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!rep)
		return -ENOMEM;

	msg.skb = rep;
	msg.portid = info->snd_portid;
	msg.seq = info->snd_seq;

	err = __tipc_nl_addr_legacy_get(net, &msg);
	if (err) {
		nlmsg_free(msg.skb);
		return err;
	}

	return genlmsg_reply(msg.skb, info);
}
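tipc_net_finalize() makes finalization one-shot with a single cmpxchg(): the first caller atomically swaps node_addr from 0 to addr and performs the setup, while every later (or concurrent) caller sees a non-zero previous value and returns immediately. A standalone C11 sketch of the same claim-the-slot pattern; the names are illustrative, not TIPC's:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int node_addr;	/* 0 == not yet finalized */

static void finalize_once(unsigned int addr)
{
	unsigned int expected = 0;

	/* exactly one caller swaps 0 -> addr; everyone else bails out */
	if (!atomic_compare_exchange_strong(&node_addr, &expected, addr))
		return;

	printf("finalized with address %u\n", addr);
}

int main(void)
{
	finalize_once(7);	/* performs the one-time setup */
	finalize_once(9);	/* no effect: already finalized */
	return 0;
}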